masster-0.4.0-py3-none-any.whl → masster-0.4.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- masster/__init__.py +8 -8
- masster/_version.py +1 -1
- masster/chromatogram.py +3 -9
- masster/data/libs/README.md +1 -1
- masster/data/libs/ccm.csv +120 -120
- masster/data/libs/ccm.py +116 -62
- masster/data/libs/central_carbon_README.md +1 -1
- masster/data/libs/urine.py +161 -65
- masster/data/libs/urine_metabolites.csv +4693 -4693
- masster/data/wiff/2025_01_14_VW_7600_LpMx_DBS_CID_2min_TOP15_030msecMS1_005msecReac_CE35_DBS-ON_3.mzML +2 -2
- masster/logger.py +43 -78
- masster/sample/__init__.py +1 -1
- masster/sample/adducts.py +264 -338
- masster/sample/defaults/find_adducts_def.py +8 -21
- masster/sample/defaults/find_features_def.py +1 -6
- masster/sample/defaults/get_spectrum_def.py +1 -5
- masster/sample/defaults/sample_def.py +1 -5
- masster/sample/h5.py +282 -561
- masster/sample/helpers.py +75 -131
- masster/sample/lib.py +17 -42
- masster/sample/load.py +17 -31
- masster/sample/parameters.py +2 -6
- masster/sample/plot.py +27 -88
- masster/sample/processing.py +87 -117
- masster/sample/quant.py +51 -57
- masster/sample/sample.py +90 -103
- masster/sample/sample5_schema.json +44 -44
- masster/sample/save.py +12 -35
- masster/sample/sciex.py +19 -66
- masster/spectrum.py +20 -58
- masster/study/__init__.py +1 -1
- masster/study/defaults/align_def.py +1 -5
- masster/study/defaults/fill_chrom_def.py +1 -5
- masster/study/defaults/fill_def.py +1 -5
- masster/study/defaults/integrate_chrom_def.py +1 -5
- masster/study/defaults/integrate_def.py +1 -5
- masster/study/defaults/study_def.py +25 -58
- masster/study/export.py +207 -233
- masster/study/h5.py +136 -470
- masster/study/helpers.py +202 -495
- masster/study/helpers_optimized.py +13 -40
- masster/study/id.py +110 -213
- masster/study/load.py +143 -230
- masster/study/plot.py +257 -518
- masster/study/processing.py +257 -469
- masster/study/save.py +5 -15
- masster/study/study.py +276 -379
- masster/study/study5_schema.json +96 -96
- {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/METADATA +1 -1
- masster-0.4.1.dist-info/RECORD +67 -0
- masster-0.4.0.dist-info/RECORD +0 -67
- {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/WHEEL +0 -0
- {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/entry_points.txt +0 -0
- {masster-0.4.0.dist-info → masster-0.4.1.dist-info}/licenses/LICENSE +0 -0
masster/sample/helpers.py
CHANGED
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 import polars as pl
+import numpy as np
 
 
 # Parameters removed - using hardcoded defaults
@@ -78,14 +79,10 @@ def _estimate_memory_usage(self):
 
     # Log the memory usage summary
     if hasattr(self, "logger"):
-        self.logger.debug(
-            f"Total DataFrame memory usage: {memory_usage['total']['mb']:.2f} MB",
-        )
+        self.logger.debug(f"Total DataFrame memory usage: {memory_usage['total']['mb']:.2f} MB")
     for df_name, stats in memory_usage.items():
         if df_name != "total" and stats["bytes"] > 0:
-            self.logger.debug(
-                f"{df_name}: {stats['rows']} rows, {stats['mb']:.2f} MB",
-            )
+            self.logger.debug(f"{df_name}: {stats['rows']} rows, {stats['mb']:.2f} MB")
 
     return memory_usage["total"]["mb"]
 
@@ -113,9 +110,7 @@ def _get_scan_uids(self, scans=None, verbose=True):
         scans_uids = self.scans_df.get_column("scan_uid").to_list()
     elif isinstance(scans, list):
         # if scans is a list, ensure all elements are valid scan_uids
-        scans_uids = [
-            s for s in scans if s in self.scans_df.get_column("scan_uid").to_list()
-        ]
+        scans_uids = [s for s in scans if s in self.scans_df.get_column("scan_uid").to_list()]
         if verbose and not scans_uids:
             self.logger.error("No valid scan_uids provided.")
 
@@ -148,9 +143,7 @@ def _get_feature_uids(self, features=None, verbose=True):
     # If features is a list, ensure all elements are valid feature_uids
     if self.features_df is None:
         if verbose:
-            self.logger.warning(
-                "No features_df available to validate feature UIDs.",
-            )
+            self.logger.warning("No features_df available to validate feature UIDs.")
         return []
 
     valid_feature_uids = self.features_df.get_column("feature_uid").to_list()
@@ -171,9 +164,7 @@ def _get_feature_uids(self, features=None, verbose=True):
 
         if feature_column is None:
             if verbose:
-                self.logger.error(
-                    "No 'feature_uid' or 'feature_id' column found in polars DataFrame.",
-                )
+                self.logger.error("No 'feature_uid' or 'feature_id' column found in polars DataFrame.")
             return []
 
         # Get unique values from the column
@@ -199,9 +190,7 @@ def _get_feature_uids(self, features=None, verbose=True):
 
         if feature_column is None:
             if verbose:
-                self.logger.error(
-                    "No 'feature_uid' or 'feature_id' column found in pandas DataFrame.",
-                )
+                self.logger.error("No 'feature_uid' or 'feature_id' column found in pandas DataFrame.")
             return []
 
         # Get unique values from the column
@@ -209,9 +198,7 @@ def _get_feature_uids(self, features=None, verbose=True):
 
         else:
             if verbose:
-                self.logger.error(
-                    "Invalid input type. Expected None, list, polars DataFrame, or pandas DataFrame.",
-                )
+                self.logger.error("Invalid input type. Expected None, list, polars DataFrame, or pandas DataFrame.")
             return []
 
     except Exception as e:
@@ -328,9 +315,7 @@ def get_eic(self, mz, mz_tol=None):
     # Filter by mz window
    mz_min = mz - mz_tol
    mz_max = mz + mz_tol
-    matches = self.ms1_df.filter(
-        (pl.col("mz") >= mz_min) & (pl.col("mz") <= mz_max),
-    )
+    matches = self.ms1_df.filter((pl.col("mz") >= mz_min) & (pl.col("mz") <= mz_max))
 
     if len(matches) == 0:
         if hasattr(self, "logger"):
@@ -340,9 +325,7 @@ def get_eic(self, mz, mz_tol=None):
         return None
 
     # Aggregate intensities per retention time. Use sum in case multiple points per rt.
-    chrom = (
-        matches.group_by("rt").agg([pl.col("inty").sum().alias("inty")]).sort("rt")
-    )
+    chrom = matches.group_by("rt").agg([pl.col("inty").sum().alias("inty")]).sort("rt")
 
     # Attach to Sample
     self.chrom_df = chrom
@@ -408,8 +391,7 @@ def select(
     if isinstance(coherence, tuple) and len(coherence) == 2:
         min_coherence, max_coherence = coherence
         feats = feats.filter(
-            (pl.col("chrom_coherence") >= min_coherence)
-            & (pl.col("chrom_coherence") <= max_coherence),
+            (pl.col("chrom_coherence") >= min_coherence) & (pl.col("chrom_coherence") <= max_coherence),
         )
     else:
         feats = feats.filter(pl.col("chrom_coherence") >= coherence)
@@ -460,8 +442,7 @@ def select(
     if isinstance(rt_delta, tuple) and len(rt_delta) == 2:
         min_rt_delta, max_rt_delta = rt_delta
         feats = feats.filter(
-            (pl.col("rt_delta") >= min_rt_delta)
-            & (pl.col("rt_delta") <= max_rt_delta),
+            (pl.col("rt_delta") >= min_rt_delta) & (pl.col("rt_delta") <= max_rt_delta),
         )
     else:
         feats = feats.filter(pl.col("rt_delta") >= rt_delta)
@@ -538,8 +519,7 @@ def select(
     if isinstance(prominence, tuple) and len(prominence) == 2:
         min_prominence, max_prominence = prominence
         feats = feats.filter(
-            (pl.col("chrom_prominence") >= min_prominence)
-            & (pl.col("chrom_prominence") <= max_prominence),
+            (pl.col("chrom_prominence") >= min_prominence) & (pl.col("chrom_prominence") <= max_prominence),
         )
     else:
         feats = feats.filter(pl.col("chrom_prominence") >= prominence)
@@ -550,9 +530,7 @@ def select(
     if height is not None:
         feats_len_before_filter = len(feats)
         # Check if chrom_height column exists, if not use chrom_height_scaled
-        height_col = (
-            "chrom_height" if "chrom_height" in feats.columns else "chrom_height_scaled"
-        )
+        height_col = "chrom_height" if "chrom_height" in feats.columns else "chrom_height_scaled"
         if isinstance(height, tuple) and len(height) == 2:
             min_height, max_height = height
             feats = feats.filter(
@@ -573,23 +551,23 @@ def select(
 def _features_sync(self):
     """
     Synchronizes the cached FeatureMap with features_df.
-
-    This ensures that the cached FeatureMap (_oms_features_map) contains only features
-    that exist in both the FeatureMap and the features_df. This is important
+
+    This ensures that the cached FeatureMap (_oms_features_map) contains only features
+    that exist in both the FeatureMap and the features_df. This is important
     after operations that modify features_df but not the FeatureMap (like filtering).
-
+
     Side Effects:
         Updates self._oms_features_map and self.features_df to contain only common features.
         Logs information about removed features.
     """
     if self.features_df is None or len(self.features_df) == 0:
         self.logger.debug("No features_df to synchronize")
-        if hasattr(self, "_oms_features_map"):
+        if hasattr(self, '_oms_features_map'):
             self._oms_features_map = None
         return
 
     # Check if we have a cached feature map
-    if not hasattr(self, "_oms_features_map") or self._oms_features_map is None:
+    if not hasattr(self, '_oms_features_map') or self._oms_features_map is None:
         self.logger.debug("No cached feature map to synchronize")
         return
 
@@ -598,26 +576,20 @@ def _features_sync(self):
     except ImportError:
         self.logger.warning("PyOpenMS not available, cannot sync FeatureMap")
         return
-
+
     try:
         # Get feature IDs from both sources
         if "feature_id" in self.features_df.columns:
-            df_feature_ids = set(
-                self.features_df.get_column("feature_id").cast(str).to_list(),
-            )
+            df_feature_ids = set(self.features_df.get_column("feature_id").cast(str).to_list())
         else:
-            self.logger.warning(
-                "No feature_id column in features_df, cannot synchronize",
-            )
+            self.logger.warning("No feature_id column in features_df, cannot synchronize")
             return
 
         # Get feature IDs from FeatureMap
         feature_map_ids = set()
         for i in range(self._oms_features_map.size()):
             feature = self._oms_features_map[i]
-            unique_id = str(
-                feature.getUniqueId(),
-            )  # Convert to string to match DataFrame
+            unique_id = str(feature.getUniqueId())  # Convert to string to match DataFrame
             feature_map_ids.add(unique_id)
 
         # Find features that exist in both
@@ -715,7 +687,7 @@ def features_delete(self, features: list | None = None):
     )
 
     # Update the OpenMS FeatureMap by creating a new one with only features to keep
-    if hasattr(self, "_oms_features_map") and self._oms_features_map is not None:
+    if hasattr(self, '_oms_features_map') and self._oms_features_map is not None:
         try:
             # Import pyopenms
             import pyopenms as oms
@@ -724,9 +696,7 @@ def features_delete(self, features: list | None = None):
             filtered_map = oms.FeatureMap()
 
             # Get the feature UIDs that should remain after deletion
-            remaining_feature_uids = self.features_df.get_column(
-                "feature_uid",
-            ).to_list()
+            remaining_feature_uids = self.features_df.get_column("feature_uid").to_list()
 
             # Iterate through existing features and keep only those not in deletion list
             for i in range(self._oms_features_map.size()):
@@ -738,16 +708,12 @@ def features_delete(self, features: list | None = None):
 
             # Replace the original FeatureMap with the filtered one
             self._oms_features_map = filtered_map
-            self.logger.debug(
-                f"OpenMS FeatureMap updated with {filtered_map.size()} remaining features.",
-            )
+            self.logger.debug(f"OpenMS FeatureMap updated with {filtered_map.size()} remaining features.")
 
         except ImportError:
             self.logger.warning("PyOpenMS not available, only updating features_df")
         except Exception as e:
-            self.logger.warning(
-                f"Could not update OpenMS FeatureMap: {e}. FeatureMap may be out of sync.",
-            )
+            self.logger.warning(f"Could not update OpenMS FeatureMap: {e}. FeatureMap may be out of sync.")
 
     # Update scans_df to remove feature_uid associations for deleted features
     if hasattr(self, "scans_df") and self.scans_df is not None:
@@ -759,9 +725,7 @@ def features_delete(self, features: list | None = None):
         )
 
     deleted_count = original_count - len(self.features_df)
-    self.logger.info(
-        f"Deleted {deleted_count} features. Remaining features: {len(self.features_df)}",
-    )
+    self.logger.info(f"Deleted {deleted_count} features. Remaining features: {len(self.features_df)}")
 
 
 def _delete_ms2(self):
@@ -784,19 +748,14 @@ def _delete_ms2(self):
     self.logger.debug("Unlinking MS2 spectra from features...")
 
     # Set ms2_scans and ms2_specs to None using Polars syntax
-    self.features_df = self.features_df.with_columns(
-        [
-            pl.lit(None).alias("ms2_scans"),
-            pl.lit(None).alias("ms2_specs"),
-        ],
-    )
+    self.features_df = self.features_df.with_columns([
+        pl.lit(None).alias("ms2_scans"),
+        pl.lit(None).alias("ms2_specs"),
+    ])
 
     # Update scans_df to remove feature_uid association for linked MS2 spectra
     self.scans_df = self.scans_df.with_columns(
-        pl.when(pl.col("ms_level") == 2)
-        .then(None)
-        .otherwise(pl.col("feature_uid"))
-        .alias("feature_uid"),
+        pl.when(pl.col("ms_level") == 2).then(None).otherwise(pl.col("feature_uid")).alias("feature_uid"),
     )
     self.logger.info("MS2 spectra unlinked from features.")
 
@@ -828,9 +787,7 @@ def features_filter(self, features):
         return
 
     if features is None:
-        self.logger.warning(
-            "No features specified to keep. Use features_delete() to delete all features.",
-        )
+        self.logger.warning("No features specified to keep. Use features_delete() to delete all features.")
         return
 
     # Get the feature UIDs to keep
@@ -852,7 +809,7 @@ def features_filter(self, features):
     feature_uids_to_delete = list(all_feature_uids - set(feature_uids_to_keep))
 
     # Update the OpenMS FeatureMap by creating a new one with only features to keep
-    if hasattr(self, "_oms_features_map") and self._oms_features_map is not None:
+    if hasattr(self, '_oms_features_map') and self._oms_features_map is not None:
         try:
             # Import pyopenms
             import pyopenms as oms
@@ -870,23 +827,15 @@ def features_filter(self, features):
 
             # Replace the original FeatureMap with the filtered one
             self._oms_features_map = filtered_map
-            self.logger.debug(
-                f"OpenMS FeatureMap updated with {filtered_map.size()} remaining features.",
-            )
+            self.logger.debug(f"OpenMS FeatureMap updated with {filtered_map.size()} remaining features.")
 
         except ImportError:
             self.logger.warning("PyOpenMS not available, only updating features_df")
         except Exception as e:
-            self.logger.warning(
-                f"Could not update OpenMS FeatureMap: {e}. FeatureMap may be out of sync.",
-            )
+            self.logger.warning(f"Could not update OpenMS FeatureMap: {e}. FeatureMap may be out of sync.")
 
     # Update scans_df to remove feature_uid associations for deleted features
-    if (
-        hasattr(self, "scans_df")
-        and self.scans_df is not None
-        and feature_uids_to_delete
-    ):
+    if hasattr(self, "scans_df") and self.scans_df is not None and feature_uids_to_delete:
         self.scans_df = self.scans_df.with_columns(
             pl.when(pl.col("feature_uid").is_in(feature_uids_to_delete))
             .then(None)
@@ -896,9 +845,7 @@ def features_filter(self, features):
 
     kept_count = len(self.features_df)
     deleted_count = original_count - kept_count
-    self.logger.info(
-        f"Kept {kept_count} features, deleted {deleted_count} features. Remaining features: {kept_count}",
-    )
+    self.logger.info(f"Kept {kept_count} features, deleted {deleted_count} features. Remaining features: {kept_count}")
 
 
 def set_source(self, filename):
@@ -942,9 +889,7 @@ def set_source(self, filename):
 
     # Log the change
     if old_file_source is not None:
-        self.logger.info(
-            f"Updated file_source from {old_file_source} to {self.file_source}",
-        )
+        self.logger.info(f"Updated file_source from {old_file_source} to {self.file_source}")
     else:
         self.logger.info(f"Set file_source to {self.file_source}")
 
@@ -952,90 +897,89 @@ def set_source(self, filename):
 def _recreate_feature_map(self):
     """
     Recreate OpenMS FeatureMap from features_df.
-
+
     This helper function creates a new OpenMS FeatureMap using the data from features_df.
     This allows us to avoid storing and loading featureXML files by default, while still
     being able to recreate the feature map when needed for OpenMS operations like
     find_features() or saving to featureXML format.
-
+
     Returns:
         oms.FeatureMap: A new FeatureMap with features from features_df, or None if no features
-
+
     Side Effects:
         Caches the created feature map in self._oms_features_map for reuse
     """
     if self.features_df is None or len(self.features_df) == 0:
         self.logger.debug("No features_df available to recreate feature map")
         return None
-
+
     try:
         import pyopenms as oms
     except ImportError:
         self.logger.warning("PyOpenMS not available, cannot recreate feature map")
         return None
-
+
     # Create new FeatureMap
     feature_map = oms.FeatureMap()
-
+
     # Set the primary MS run path if available
-    if hasattr(self, "file_path") and self.file_path:
+    if hasattr(self, 'file_path') and self.file_path:
         feature_map.setPrimaryMSRunPath([self.file_path.encode()])
-
+
     # Convert DataFrame features to OpenMS Features
     for i, feature_row in enumerate(self.features_df.iter_rows(named=True)):
         feature = oms.Feature()
-
+
         # Set basic properties from DataFrame (handle missing values gracefully)
         try:
-            if feature_row.get("feature_id") is not None:
-                feature.setUniqueId(int(feature_row["feature_id"]))
+            if feature_row.get('feature_id') is not None:
+                feature.setUniqueId(int(feature_row['feature_id']))
             else:
                 feature.setUniqueId(i)  # Use index as fallback
-
-            if feature_row.get("mz") is not None:
-                feature.setMZ(float(feature_row["mz"]))
-            if feature_row.get("rt") is not None:
-                feature.setRT(float(feature_row["rt"]))
-            if feature_row.get("inty") is not None:
-                feature.setIntensity(float(feature_row["inty"]))
-            if feature_row.get("quality") is not None:
-                feature.setOverallQuality(float(feature_row["quality"]))
-            if feature_row.get("charge") is not None:
-                feature.setCharge(int(feature_row["charge"]))
-
+
+            if feature_row.get('mz') is not None:
+                feature.setMZ(float(feature_row['mz']))
+            if feature_row.get('rt') is not None:
+                feature.setRT(float(feature_row['rt']))
+            if feature_row.get('inty') is not None:
+                feature.setIntensity(float(feature_row['inty']))
+            if feature_row.get('quality') is not None:
+                feature.setOverallQuality(float(feature_row['quality']))
+            if feature_row.get('charge') is not None:
+                feature.setCharge(int(feature_row['charge']))
+
             # Add to feature map
             feature_map.push_back(feature)
-
+
         except (ValueError, TypeError) as e:
             self.logger.warning(f"Skipping feature due to conversion error: {e}")
             continue
-
+
     # Ensure unique IDs
     feature_map.ensureUniqueId()
-
+
     # Cache the feature map
     self._oms_features_map = feature_map
-
-    self.logger.debug(
-        f"Recreated FeatureMap with {feature_map.size()} features from features_df",
-    )
+
+    self.logger.debug(f"Recreated FeatureMap with {feature_map.size()} features from features_df")
     return feature_map
 
 
 def _get_feature_map(self):
     """
     Get the OpenMS FeatureMap, creating it from features_df if needed.
-
+
     This property-like method returns the cached feature map if available,
-    or recreates it from features_df if not. This allows lazy loading of
+    or recreates it from features_df if not. This allows lazy loading of
     feature maps only when needed for OpenMS operations.
-
+
     Returns:
         oms.FeatureMap or None: The feature map, or None if not available
     """
     # Return cached feature map if available
-    if hasattr(self, "_oms_features_map") and self._oms_features_map is not None:
+    if hasattr(self, '_oms_features_map') and self._oms_features_map is not None:
         return self._oms_features_map
-
+
     # Otherwise recreate from features_df
     return self._recreate_feature_map()
+
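The get_eic hunks above only reformat the polars EIC logic; the behavior stays the same: filter MS1 points to an m/z window, then sum intensity per retention time and sort by rt. A minimal, runnable sketch of that pattern with toy data — the column names (mz, rt, inty) come from the diff, everything else is illustrative and not the Sample API:

import polars as pl

# Toy MS1 point cloud standing in for sample.ms1_df (assumption: same columns as the diff).
ms1_df = pl.DataFrame({
    "mz": [180.063, 180.064, 180.070, 180.063, 250.100],
    "rt": [12.0, 12.0, 12.5, 13.0, 12.0],
    "inty": [1.0e4, 2.0e4, 5.0e3, 8.0e3, 3.0e4],
})

mz, mz_tol = 180.063, 0.005  # target m/z and absolute tolerance in Da
mz_min, mz_max = mz - mz_tol, mz + mz_tol

# Filter by mz window, then aggregate intensities per retention time
# (sum, since several points can share one rt) — the reformatted lines verbatim.
matches = ms1_df.filter((pl.col("mz") >= mz_min) & (pl.col("mz") <= mz_max))
chrom = matches.group_by("rt").agg([pl.col("inty").sum().alias("inty")]).sort("rt")
print(chrom)  # two rows: rt 12.0 (two points summed) and rt 13.0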
masster/sample/lib.py
CHANGED
@@ -1,8 +1,8 @@
 """
 lib.py
 
-This module provides the Lib class and utility functions for mass spectrometry compound library
-management and feature annotation. It contains core functionality for compound library management,
+This module provides the Lib class and utility functions for mass spectrometry compound library
+management and feature annotation. It contains core functionality for compound library management,
 target identification, adduct handling, and various analytical operations.
 
 Key Features:
@@ -34,7 +34,7 @@ Supported Adducts:
 
 Example Usage:
     ```python
-    from
+    from masster.sample.lib import Lib
 
     # Create library instance
     lib = Lib()
@@ -63,7 +63,7 @@ import pyopenms as oms
 
 from tqdm import tqdm
 
-from
+from masster.chromatogram import Chromatogram
 # Parameters removed - using hardcoded defaults
 
 
@@ -251,9 +251,7 @@ def lib_link(
 
     for _index, row in self.lib.iterrows():
         # find all features that match the mz and rt is not None
-        mask = (self.features_df["mz"] >= row["mz"] - mz_tol_lib) & (
-            self.features_df["mz"] <= row["mz"] + mz_tol_lib
-        )
+        mask = (self.features_df["mz"] >= row["mz"] - mz_tol_lib) & (self.features_df["mz"] <= row["mz"] + mz_tol_lib)
         if row["rt"] is not None and rt_tol_lib is not np.nan:
             mask &= (self.features_df["rt"] >= row["rt"] - rt_tol_lib) & (
                 self.features_df["rt"] <= row["rt"] + rt_tol_lib
@@ -280,12 +278,8 @@ def lib_link(
                 "mz": f["mz"].values[0],
                 "delta_mz": row["mz"] - f["mz"].values[0],
                 "rt": f["rt"].values[0],
-                "delta_rt": row["rt"] - f["rt"].values[0]
-                if row["rt"] is not None
-                else None,
-                "ms2_scans": f["ms2_scans"].values[0]
-                if "ms2_scans" in self.features_df.columns
-                else None,
+                "delta_rt": row["rt"] - f["rt"].values[0] if row["rt"] is not None else None,
+                "ms2_scans": f["ms2_scans"].values[0] if "ms2_scans" in self.features_df.columns else None,
                 "eic": None,
             }
             lib_matches.append(new_match)
@@ -418,9 +412,7 @@ def save_lib_mgf(
         desc="Export MGF",
     ):
         # find the feature with feature_uid == matchrow["feature_uid"]
-        row = self.features_df[
-            self.features_df["feature_uid"] == matchrow["feature_uid"]
-        ].iloc[0]
+        row = self.features_df[self.features_df["feature_uid"] == matchrow["feature_uid"]].iloc[0]
         if row["ms2_scans"] is None:
             skip = skip + 1
             continue
@@ -546,9 +538,7 @@ def save_lib_mgf(
            d = {
                "PEPMASS": row["mz"],
                "RTINSECONDS": row["rt"],
-                "IONMODE": "positive"
-                if matchrow["adduct"][-1] == "+"
-                else "negative",
+                "IONMODE": "positive" if matchrow["adduct"][-1] == "+" else "negative",
                "CHARGE": "1" + matchrow["adduct"].split("]")[1],
                "NAME": f"{matchrow['name']}",
                "SMILES": matchrow["smiles"],
@@ -616,9 +606,7 @@ def save_lib_mgf(
            d = {
                "PEPMASS": row["mz"],
                "RTINSECONDS": row["rt"],
-                "IONMODE": "positive"
-                if matchrow["adduct"][-1] == "+"
-                else "negative",
+                "IONMODE": "positive" if matchrow["adduct"][-1] == "+" else "negative",
                "CHARGE": "1" + matchrow["adduct"].split("]")[1],
                "NAME": f"{matchrow['name']}",
                "SMILES": matchrow["smiles"],
@@ -653,18 +641,14 @@ def save_lib_mgf(
                spec = spec.centroid(
                    tolerance=self.parameters["mz_tol_ms1_da"],
                    ppm=self.parameters["mz_tol_ms1_ppm"],
-                    min_points=self.parameters[
-                        "centroid_min_points_ms1"
-                    ],
+                    min_points=self.parameters["centroid_min_points_ms1"],
                    algo=centroid_algo,
                )
            elif spec.ms_level == 2:
                spec = spec.centroid(
                    tolerance=self.parameters["mz_tol_ms2_da"],
                    ppm=self.parameters["mz_tol_ms2_ppm"],
-                    min_points=self.parameters[
-                        "centroid_min_points_ms2"
-                    ],
+                    min_points=self.parameters["centroid_min_points_ms2"],
                    algo=centroid_algo,
                )
            if deisotope:
@@ -699,9 +683,7 @@ def save_lib_mgf(
            d = {
                "PEPMASS": row["mz"],
                "RTINSECONDS": row["rt"],
-                "IONMODE": "positive"
-                if matchrow["adduct"][-1] == "+"
-                else "negative",
+                "IONMODE": "positive" if matchrow["adduct"][-1] == "+" else "negative",
                "CHARGE": "1" + matchrow["adduct"].split("]")[1],
                "NAME": f"{matchrow['name']}",
                "SMILES": matchrow["smiles"],
@@ -739,8 +721,7 @@ def save_lib_mgf(
            kineticenergy = None
            if mslevel > 1:
                if (
-                    "CID" in filename.upper()
-                    or "ZTS" in filename.upper()
+                    "CID" in filename.upper() or "ZTS" in filename.upper()
                ) and "EAD" in filename.upper():
                    activation = "CID-EAD"
                    match = re.search(r"(\d+)KE", filename.upper())
@@ -752,17 +733,13 @@ def save_lib_mgf(
                    kineticenergy = int(match.group(1))
                else:
                    activation = "CID"
-                energy = (
-                    spec.energy if hasattr(spec, "energy") else None
-                )
+                energy = spec.energy if hasattr(spec, "energy") else None
 
            spec = filter_peaks(spec, inty_min=inty_min)
            d = {
                "PEPMASS": row["mz"],
                "RTINSECONDS": row["rt"],
-                "IONMODE": "positive"
-                if matchrow["adduct"][-1] == "+"
-                else "negative",
+                "IONMODE": "positive" if matchrow["adduct"][-1] == "+" else "negative",
                "CHARGE": "1" + matchrow["adduct"].split("]")[1],
                "NAME": f"{matchrow['name']}",
                "SMILES": matchrow["smiles"],
@@ -775,9 +752,7 @@ def save_lib_mgf(
                "FILENAME": filename,
                "SCANS": ms1_scan_uid,
                "FID": row["fid"],
-                "MSLEVEL": 1
-                if spec.ms_level is None
-                else spec.ms_level,
+                "MSLEVEL": 1 if spec.ms_level is None else spec.ms_level,
            }
            write_ion(f, d, spec)
 
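The save_lib_mgf hunks repeatedly collapse the same two expressions into single lines: ion mode is read from the adduct string's final character, and the MGF CHARGE field is built from the text after the closing bracket. A small standalone illustration of that convention — the adduct labels are assumed examples, while the two parsing expressions mirror the diff:

# Assumed adduct labels; the two expressions below are the ones from the diff.
for adduct in ["[M+H]+", "[M+Na]+", "[M-H]-"]:
    ionmode = "positive" if adduct[-1] == "+" else "negative"
    charge = "1" + adduct.split("]")[1]  # "[M+H]+" -> "1+", "[M-H]-" -> "1-"
    print(adduct, ionmode, charge)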