masster 0.5.27__py3-none-any.whl → 0.6.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of masster might be problematic.
- masster/_version.py +1 -1
- masster/data/libs/aa_nort.json +240 -0
- masster/data/libs/ccm_nort.json +1319 -0
- masster/lib/lib.py +1 -1
- masster/logger.py +0 -6
- masster/sample/adducts.py +1 -1
- masster/sample/defaults/find_adducts_def.py +1 -1
- masster/sample/h5.py +152 -2
- masster/sample/helpers.py +91 -5
- masster/sample/id.py +1160 -0
- masster/sample/importers.py +316 -0
- masster/sample/plot.py +175 -71
- masster/sample/sample.py +18 -3
- masster/sample/sample5_schema.json +99 -1
- masster/study/defaults/study_def.py +8 -12
- masster/study/id.py +59 -12
- masster/study/load.py +0 -11
- masster/study/merge.py +153 -0
- masster/study/plot.py +197 -0
- masster/study/study.py +3 -1
- masster/study/study5_schema.json +15 -0
- masster/wizard/wizard.py +11 -12
- {masster-0.5.27.dist-info → masster-0.6.0.dist-info}/METADATA +99 -60
- {masster-0.5.27.dist-info → masster-0.6.0.dist-info}/RECORD +27 -26
- masster/data/libs/aa.csv +0 -22
- masster/data/libs/ccm.csv +0 -120
- masster/data/libs/urine.csv +0 -4693
- {masster-0.5.27.dist-info → masster-0.6.0.dist-info}/WHEEL +0 -0
- {masster-0.5.27.dist-info → masster-0.6.0.dist-info}/entry_points.txt +0 -0
- {masster-0.5.27.dist-info → masster-0.6.0.dist-info}/licenses/LICENSE +0 -0
masster/sample/sample.py
CHANGED

@@ -129,6 +129,12 @@ from masster.sample.helpers import get_eic
 from masster.sample.helpers import set_source
 from masster.sample.helpers import _recreate_feature_map
 from masster.sample.helpers import _get_feature_map
+from masster.sample.id import lib_load
+from masster.sample.id import identify
+from masster.sample.id import get_id
+from masster.sample.id import id_reset
+from masster.sample.id import lib_reset
+from masster.sample.importers import import_oracle
 from masster.sample.load import chrom_extract
 from masster.sample.load import _index_file
 from masster.sample.load import load

@@ -259,9 +265,10 @@ class Sample:
         # the polars data frame with MS1 level data
         self.ms1_df = pl.DataFrame()

-        #
-        self.
-        self.
+        # identification DataFrames (lib_df and id_df)
+        self.lib_df = None  # library DataFrame (from masster.lib or CSV/JSON)
+        self.id_df = None  # identification results DataFrame
+        self._lib = None  # reference to Lib object if loaded
         self.chrom_df = None

         if params.filename is not None:

@@ -292,6 +299,14 @@ class Sample:
     update_parameters = update_parameters
     get_parameters_property = get_parameters_property
     set_parameters_property = set_parameters_property
+    # Identification methods from id.py
+    lib_load = lib_load
+    identify = identify
+    get_id = get_id
+    id_reset = id_reset
+    lib_reset = lib_reset
+    # Importers from importers.py
+    import_oracle = import_oracle
     export_features = export_features
     export_xlsx = export_xlsx
     export_mgf = export_mgf
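
Note: the hunks above wire the new identification API onto the Sample class. A minimal usage sketch follows; the method names come from this diff, but the constructor argument, the library file path, and the call signatures are assumptions, not taken from the package.

    # Hypothetical walk-through of the new Sample-level identification methods.
    from masster.sample.sample import Sample

    sample = Sample(filename="run01.mzML")               # assumed constructor argument
    sample.lib_load("masster/data/libs/ccm_nort.json")   # load a compound library into lib_df
    sample.identify()                                    # match features against the loaded library
    results = sample.get_id()                            # fetch identification results (id_df)
    sample.id_reset()                                    # clear identifications
    sample.lib_reset()                                   # drop the loaded library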
masster/sample/sample5_schema.json
CHANGED

@@ -93,10 +93,108 @@
       },
       "ms1_spec": {
         "dtype": "pl.Object"
+      },
+      "id_top_name": {
+        "dtype": "pl.Utf8"
+      },
+      "id_top_class": {
+        "dtype": "pl.Utf8"
+      },
+      "id_top_adduct": {
+        "dtype": "pl.Utf8"
+      },
+      "id_top_score": {
+        "dtype": "pl.Float64"
+      },
+      "id_source": {
+        "dtype": "pl.Utf8"
+      }
+    }
+  },
+  "lib_df": {
+    "columns": {
+      "lib_uid": {
+        "dtype": "pl.Int64"
+      },
+      "cmpd_uid": {
+        "dtype": "pl.Int64"
+      },
+      "name": {
+        "dtype": "pl.Utf8"
+      },
+      "shortname": {
+        "dtype": "pl.Utf8"
+      },
+      "class": {
+        "dtype": "pl.Utf8"
+      },
+      "formula": {
+        "dtype": "pl.Utf8"
+      },
+      "iso": {
+        "dtype": "pl.Int64"
+      },
+      "smiles": {
+        "dtype": "pl.Utf8"
+      },
+      "inchi": {
+        "dtype": "pl.Utf8"
+      },
+      "inchikey": {
+        "dtype": "pl.Utf8"
+      },
+      "adduct": {
+        "dtype": "pl.Utf8"
+      },
+      "z": {
+        "dtype": "pl.Int64"
+      },
+      "m": {
+        "dtype": "pl.Float64"
+      },
+      "mz": {
+        "dtype": "pl.Float64"
+      },
+      "rt": {
+        "dtype": "pl.Float64"
+      },
+      "quant_group": {
+        "dtype": "pl.Int64"
+      },
+      "probability": {
+        "dtype": "pl.Float64"
+      },
+      "source_id": {
+        "dtype": "pl.Utf8"
+      }
+    }
+  },
+  "id_df": {
+    "columns": {
+      "feature_uid": {
+        "dtype": "pl.Int64"
+      },
+      "lib_uid": {
+        "dtype": "pl.Int64"
+      },
+      "mz_delta": {
+        "dtype": "pl.Float64"
+      },
+      "rt_delta": {
+        "dtype": "pl.Float64"
+      },
+      "matcher": {
+        "dtype": "pl.Utf8"
+      },
+      "score": {
+        "dtype": "pl.Float64"
+      },
+      "iso": {
+        "dtype": "pl.Int64"
       }
     }
   },
-  "generated_date": "2025-
+  "generated_date": "2025-10-30",
   "ms1_df": {
     "columns": {
       "cycle": {
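
Note: the schema addition above declares the Polars dtypes for the new lib_df and id_df tables. A minimal sketch (not package code) of empty frames matching those declarations:

    import polars as pl

    lib_df = pl.DataFrame(schema={
        "lib_uid": pl.Int64, "cmpd_uid": pl.Int64, "name": pl.Utf8, "shortname": pl.Utf8,
        "class": pl.Utf8, "formula": pl.Utf8, "iso": pl.Int64, "smiles": pl.Utf8,
        "inchi": pl.Utf8, "inchikey": pl.Utf8, "adduct": pl.Utf8, "z": pl.Int64,
        "m": pl.Float64, "mz": pl.Float64, "rt": pl.Float64, "quant_group": pl.Int64,
        "probability": pl.Float64, "source_id": pl.Utf8,
    })
    id_df = pl.DataFrame(schema={
        "feature_uid": pl.Int64, "lib_uid": pl.Int64, "mz_delta": pl.Float64,
        "rt_delta": pl.Float64, "matcher": pl.Utf8, "score": pl.Float64, "iso": pl.Int64,
    })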
masster/study/defaults/study_def.py
CHANGED

@@ -96,19 +96,15 @@ class study_defaults:
         "adducts": {
             "dtype": "list[str]",
             "description": "List of adduct specifications in OpenMS format (element:charge:probability). Charged adduct probabilities must sum to 1.0.",
-            "default": ["H
+            "default": ["+H:1:0.65", "+Na:1:0.15", "+NH4:1:0.15", "+K:1:0.05"],
             "examples": {
-                "positive": ["H
-                "negative": [
-                    "H-1:-:0.95",
-                    "Cl:-:0.05",
-                    "CH2O2:0:0.2",
-                    "H-2-O:0:0.2",
-                ],
+                "positive": ["+H:1:0.65", "+Na:1:0.15", "+NH4:1:0.15", "+K:1:0.05", "-H2O:0:0.15"],
+                "negative": ["-H:-1:0.95", "+Cl:-1:0.05", "+CH2O2:0:0.2", "-H2O:0:0.2"],
             },
             "validation_rules": [
-                "Format:
-                "
+                "Format: formula:charge:probability (e.g., '+H:1:0.65', '-H:-1:0.95', '-H2O:0:0.15')",
+                "Formula must start with + or - to indicate gain/loss (e.g., '+H', '-H', '+Na', '-H2O')",
+                "Charge must be an integer (positive, negative, or 0 for neutral)",
                 "Probability must be between 0.0 and 1.0",
                 "Sum of all charged adduct probabilities must equal 1.0",
             ],

@@ -128,7 +124,7 @@ class study_defaults:
         """Set polarity-specific defaults for adducts if not explicitly provided."""
         # If adducts is None, set based on polarity
         if self.adducts is None:
-            if self.polarity.lower() in ["positive", "pos"]:
+            if self.polarity.lower() in ["positive", "pos", "+"]:
                 self.adducts = [
                     "+H:1:0.65",
                     "+Na:1:0.15",

@@ -136,7 +132,7 @@ class study_defaults:
                     "+K:1:0.05",
                     "-H2O:0:0.15",
                 ]
-            elif self.polarity.lower() in ["negative", "neg"]:
+            elif self.polarity.lower() in ["negative", "neg", "-"]:
                 self.adducts = [
                     "-H:-1:0.9",
                     "+Cl:-1:0.1",
masster/study/id.py
CHANGED

@@ -24,7 +24,8 @@ def lib_load(
         lib_source: either a CSV/JSON file path (str) or a Lib instance
         polarity: ionization polarity ("positive" or "negative") - used when lib_source is a CSV/JSON path.
             If None, uses study.polarity automatically.
-        adducts: specific adducts to generate - used when lib_source is a CSV/JSON path
+        adducts: specific adducts to generate - used when lib_source is a CSV/JSON path.
+            If None, uses study.parameters.adducts if available.
         iso: isotope generation mode ("13C" to generate 13C isotopes, None for no isotopes)

     Side effects:

@@ -51,6 +52,18 @@ def lib_load(
     else:
         polarity = "positive"  # Default fallback
     study.logger.debug(f"Using study polarity: {polarity}")
+
+    # Use study.parameters.adducts if adducts not explicitly provided
+    # If study.parameters.adducts is also None, lib will use its default adducts for the polarity
+    if adducts is None:
+        if hasattr(study, "parameters") and hasattr(study.parameters, "adducts"):
+            adducts = study.parameters.adducts
+            if adducts:
+                study.logger.debug(f"Using study.parameters.adducts: {adducts}")
+            else:
+                study.logger.debug(f"study.parameters.adducts is None, lib will use default adducts for {polarity} mode")
+        else:
+            study.logger.debug(f"study.parameters.adducts not found, lib will use default adducts for {polarity} mode")

     # Handle string input (CSV or JSON file path)
     if isinstance(lib_source, str):
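
Note: with this change, lib_load resolves adducts in three steps: the explicit adducts argument, then study.parameters.adducts, then the library's own defaults for the polarity. A hypothetical call pattern, assuming Study binds these module functions as methods the way Sample does in this release (import path, constructor, and file names are placeholders, not taken from the package):

    from masster.study.study import Study   # assumed import path

    study = Study()                          # assumed constructor
    study.lib_load("ccm_nort.json")          # adducts=None -> falls back to study.parameters.adducts
    study.lib_load("aa_nort.json",
                   adducts=["+H:1:0.65", "+Na:1:0.15", "+NH4:1:0.15", "+K:1:0.05"])  # explicit override
    study.identify()                         # RT filtering is dropped automatically if the library has no RT data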
@@ -403,42 +416,64 @@ def _find_matches_vectorized(lib_df, cons_mz, cons_rt, mz_tol, rt_tol, logger, c
     """
     Find library matches using optimized vectorized operations.

-
+    Automatically skips RT filtering if library has no RT data for the matched entries.
     """
     # Filter by m/z tolerance using vectorized operations
     matches = lib_df.filter((pl.col("mz") >= cons_mz - mz_tol) & (pl.col("mz") <= cons_mz + mz_tol))

     initial_match_count = len(matches)

-    # Apply RT filter if
+    # Apply RT filter if requested AND if data is available
+    # Strategy: Handle mixed RT/no-RT entries properly by treating them separately
     if rt_tol is not None and cons_rt is not None and not matches.is_empty():
-        #
+        # Separate entries with and without RT data
        rt_candidates = matches.filter(pl.col("rt").is_not_null())
+        no_rt_entries = matches.filter(pl.col("rt").is_null())

         if not rt_candidates.is_empty():
             # Apply RT filtering to candidates with RT data
             rt_matches = rt_candidates.filter((pl.col("rt") >= cons_rt - rt_tol) & (pl.col("rt") <= cons_rt + rt_tol))

-
+            # Combine RT-filtered matches with entries that have no RT data
+            # Rationale: Entries without RT can't be filtered by RT, so include them
+            if not rt_matches.is_empty() and not no_rt_entries.is_empty():
+                # Both RT matches and no-RT entries exist
+                matches = pl.concat([rt_matches, no_rt_entries])
+                if logger:
+                    logger.debug(
+                        f"Consensus {cons_uid}: {initial_match_count} m/z matches, {len(rt_candidates)} with RT, "
+                        f"{len(rt_matches)} passed RT filter, {len(no_rt_entries)} with no RT → {len(matches)} total matches"
+                    )
+            elif not rt_matches.is_empty():
+                # Only RT matches, no entries without RT
                 matches = rt_matches
                 if logger:
                     logger.debug(
-                        f"Consensus {cons_uid}: {initial_match_count} m/z matches, {len(rt_candidates)} with RT,
+                        f"Consensus {cons_uid}: {initial_match_count} m/z matches, {len(rt_candidates)} with RT, "
+                        f"{len(matches)} passed RT filter"
+                    )
+            elif not no_rt_entries.is_empty():
+                # No RT matches passed filter, but there are entries without RT
+                matches = no_rt_entries
+                if logger:
+                    logger.debug(
+                        f"Consensus {cons_uid}: {initial_match_count} m/z matches, {len(rt_candidates)} with RT but none passed RT filter, "
+                        f"using {len(matches)} entries with no RT data"
                     )
             else:
-                #
-                matches =
+                # No RT matches and no entries without RT - return empty
+                matches = pl.DataFrame()
                 if logger:
                     logger.debug(
                         f"Consensus {cons_uid}: RT filtering eliminated all {len(rt_candidates)} candidates (rt_tol={rt_tol}s) - no matches returned"
                     )
         else:
-            #
+            # All m/z matches have no RT data - keep all m/z matches
             if logger:
                 logger.debug(
-                    f"Consensus {cons_uid}: {initial_match_count} m/z matches
+                    f"Consensus {cons_uid}: {initial_match_count} m/z matches, all have no RT data - using m/z matches only"
                 )
-            matches
+            # matches already contains the m/z-filtered results (which are all no_rt_entries)

     # FIX 1: Add stricter m/z validation - prioritize more accurate matches
     if not matches.is_empty():
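
Note: the reworked branching above reduces to a simple recipe: filter the library by m/z window, split the hits by RT availability, RT-filter only the entries that actually have RT, then recombine. A self-contained toy run of the same strategy (illustrative data, not masster code):

    import polars as pl

    lib_df = pl.DataFrame({
        "name": ["cmpd_A", "cmpd_B", "cmpd_C"],
        "mz":   [90.0550, 90.0555, 90.0549],
        "rt":   [65.0, 300.0, None],          # seconds; None = library entry without RT
    })
    cons_mz, cons_rt, mz_tol, rt_tol = 90.055, 60.0, 0.01, 30.0

    matches = lib_df.filter((pl.col("mz") >= cons_mz - mz_tol) & (pl.col("mz") <= cons_mz + mz_tol))
    rt_candidates = matches.filter(pl.col("rt").is_not_null())
    no_rt_entries = matches.filter(pl.col("rt").is_null())
    rt_matches = rt_candidates.filter((pl.col("rt") >= cons_rt - rt_tol) & (pl.col("rt") <= cons_rt + rt_tol))
    final = pl.concat([rt_matches, no_rt_entries]) if len(no_rt_entries) > 0 else rt_matches
    print(final)  # cmpd_A passes the RT window, cmpd_C is kept (no RT), cmpd_B (rt=300 s) is dropped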
@@ -884,6 +919,18 @@ def identify(study, features=None, params=None, **kwargs):
     effective_mz_tol = getattr(params, "mz_tol", 0.01)
     effective_rt_tol = getattr(params, "rt_tol", 2.0)

+    # Check if library has RT data - if not, disable RT filtering
+    if effective_rt_tol is not None and hasattr(study, "lib_df") and study.lib_df is not None:
+        if "rt" in study.lib_df.columns:
+            # Check if library has any non-null RT values
+            rt_count = study.lib_df.filter(pl.col("rt").is_not_null()).shape[0]
+            if rt_count == 0:
+                if logger:
+                    logger.info(
+                        f"Library has no retention time data - disabling RT filtering (was rt_tol={effective_rt_tol})"
+                    )
+                effective_rt_tol = None
+
     if logger:
         logger.debug(
             f"Starting identification with mz_tolerance={effective_mz_tol}, rt_tolerance={effective_rt_tol}",
@@ -1483,7 +1530,7 @@ def _get_adducts(study, adducts_list: list | None = None, **kwargs):
             if charge_min <= abs(total_charge) <= charge_max and total_charge != 0:
                 components = [spec] * multiplier
                 formatted_name = _format_adduct_name(components)
-                probability_multiplied = float(spec["probability"]) ** multiplier
+                probability_multiplied = (float(spec["probability"]) ** multiplier) / 2.0

                 combinations_list.append(
                     {
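
Note: the only change in _get_adducts halves the combined probability of multiplied adducts. For a doubly charged species built from "+H:1:0.65" with multiplier 2, for example, the assigned probability drops from 0.65 ** 2 = 0.4225 to (0.65 ** 2) / 2.0 ≈ 0.211. The multiplier-2 case is an illustration; the diff does not state the rationale for the halving.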
masster/study/load.py
CHANGED

@@ -191,17 +191,6 @@ def load(self, filename=None):

    _load_study5(self, filename)

-    # After loading the study, check if we have consensus features before loading consensus XML
-    # if (self.consensus_df is not None and not self.consensus_df.is_empty()):
-    # consensus_xml_path = filename.replace(".study5", ".consensusXML")
-    # if os.path.exists(consensus_xml_path):
-    # self._load_consensusXML(filename=consensus_xml_path)
-    # self.logger.info(f"Automatically loaded consensus from {consensus_xml_path}")
-    # else:
-    # self.logger.warning(f"No consensus XML file found at {consensus_xml_path}")
-    # else:
-    # self.logger.debug("No consensus features found, skipping consensusXML loading")
-
    self.filename = filename

masster/study/merge.py
CHANGED

@@ -441,9 +441,15 @@ def merge(study, **kwargs) -> None:
     cached_valid_adducts = None
     try:
         cached_adducts_df = study._get_adducts()
+        # Remove all adducts with wrong polarity
+        if study.polarity == "positive":
+            cached_adducts_df = cached_adducts_df.filter(pl.col("charge") >= 0)
+        else:
+            cached_adducts_df = cached_adducts_df.filter(pl.col("charge") <= 0)
         if not cached_adducts_df.is_empty():
             cached_valid_adducts = set(cached_adducts_df["name"].to_list())
         else:
+            study.logger.warning(f"No valid adducts found for polarity '{study.polarity}'")
             cached_valid_adducts = set()
     except Exception as e:
         study.logger.warning(f"Could not retrieve study adducts: {e}")
@@ -452,6 +458,13 @@
     # Always allow '?' adducts
     cached_valid_adducts.add("?")

+    # Bypass for single sample case
+    if len(study.samples_df) == 1:
+        study.logger.info("Single sample detected - bypassing merge algorithm and using direct feature mapping")
+        _handle_single_sample_merge(study, cached_adducts_df, cached_valid_adducts)
+        # Skip all post-processing for single sample case
+        return
+
     # Route to algorithm implementation
     if params.method == "kd":
         consensus_map = _merge_kd(study, params)
@@ -1719,6 +1732,10 @@ def _calculate_consensus_statistics(
         mz_values: m/z values from chunk consensus features
         intensity_values: Intensity values from chunk consensus features
         quality_values: Quality values from chunk consensus features
+        number_features: Number of unique features contributing
+        number_samples: Number of unique samples contributing
+        cached_adducts_df: Cached DataFrame of valid adducts for the study
+        cached_valid_adducts: Cached set of valid adduct names for the study

     Returns:
         Dictionary with consensus feature metadata
@@ -3612,6 +3629,142 @@ def __merge_adduct_grouping(study, consensus_data, rt_tol, mz_tol):
     return adduct_group_list, adduct_of_list


+def _handle_single_sample_merge(study, cached_adducts_df=None, cached_valid_adducts=None):
+    """
+    Handle merge for the special case of a single sample.
+    Directly populate consensus_df from the sample's features_df without any filtering.
+
+    Args:
+        study: Study object with single sample
+        cached_adducts_df: Pre-computed adducts DataFrame (optional)
+        cached_valid_adducts: Set of valid adduct names (optional)
+    """
+    import polars as pl
+    import uuid
+
+    if len(study.samples_df) != 1:
+        raise ValueError("_handle_single_sample_merge should only be called with exactly one sample")
+
+    # Get the single sample's features
+    sample_row = study.samples_df.row(0, named=True)
+    sample_uid = sample_row["sample_uid"]
+
+    # Filter features for this sample
+    sample_features = study.features_df.filter(pl.col("sample_uid") == sample_uid)
+
+    if len(sample_features) == 0:
+        study.logger.warning("No features found for single sample")
+        study.consensus_df = pl.DataFrame()
+        study.consensus_mapping_df = pl.DataFrame()
+        return
+
+    study.logger.info(f"Creating consensus from {len(sample_features)} features in single sample")
+
+    # Create consensus features directly from sample features
+    consensus_list = []
+    mapping_list = []
+
+    # Cache valid adducts
+    valid_adducts = cached_valid_adducts if cached_valid_adducts is not None else set()
+    valid_adducts.add("?")  # Always allow '?' adducts
+
+    for i, feature_row in enumerate(sample_features.iter_rows(named=True)):
+        # Generate unique consensus ID
+        consensus_id_str = str(uuid.uuid4()).replace("-", "")[:16]
+
+        # Handle adduct information
+        adduct = feature_row.get("adduct")
+        if adduct is None or adduct not in valid_adducts:
+            # Set default adduct based on study polarity
+            study_polarity = getattr(study, "polarity", "positive")
+            if study_polarity in ["negative", "neg"]:
+                adduct = "[M-?]1-"
+                adduct_charge = -1
+                adduct_mass_shift = -1.007825
+            else:
+                adduct = "[M+?]1+"
+                adduct_charge = 1
+                adduct_mass_shift = 1.007825
+        else:
+            # Try to get charge and mass shift from cached adducts
+            adduct_charge = 1
+            adduct_mass_shift = 1.007825
+            if cached_adducts_df is not None and not cached_adducts_df.is_empty():
+                matching_adduct = cached_adducts_df.filter(pl.col("name") == adduct)
+                if not matching_adduct.is_empty():
+                    adduct_row = matching_adduct.row(0, named=True)
+                    adduct_charge = adduct_row["charge"]
+                    adduct_mass_shift = adduct_row["mass_shift"]
+
+        # Calculate neutral mass
+        mz = feature_row.get("mz", 0.0)
+        if adduct_charge and adduct_mass_shift is not None:
+            adduct_mass_neutral = mz * abs(adduct_charge) - adduct_mass_shift
+        else:
+            adduct_mass_neutral = None
+
+        # Count MS2 scans
+        ms2_scans = feature_row.get("ms2_scans", [])
+        ms2_count = len(ms2_scans) if ms2_scans else 0
+
+        # Create consensus feature metadata
+        consensus_feature = {
+            "consensus_uid": i,
+            "consensus_id": consensus_id_str,
+            "quality": feature_row.get("quality", 1.0),
+            "number_samples": 1,  # Always 1 for single sample
+            "rt": feature_row.get("rt", 0.0),
+            "mz": mz,
+            "rt_min": feature_row.get("rt", 0.0),
+            "rt_max": feature_row.get("rt", 0.0),
+            "rt_mean": feature_row.get("rt", 0.0),
+            "rt_start_mean": feature_row.get("rt_start", 0.0),
+            "rt_end_mean": feature_row.get("rt_end", 0.0),
+            "rt_delta_mean": feature_row.get("rt_delta", 0.0),
+            "mz_min": mz,
+            "mz_max": mz,
+            "mz_mean": mz,
+            "mz_start_mean": feature_row.get("mz_start", 0.0),
+            "mz_end_mean": feature_row.get("mz_end", 0.0),
+            "inty_mean": feature_row.get("inty", 0.0),
+            "bl": -1.0,
+            "chrom_coherence_mean": feature_row.get("chrom_coherence", 0.0),
+            "chrom_prominence_mean": feature_row.get("chrom_prominence", 0.0),
+            "chrom_prominence_scaled_mean": feature_row.get("chrom_prominence_scaled", 0.0),
+            "chrom_height_scaled_mean": feature_row.get("chrom_height_scaled", 0.0),
+            "iso": None,  # Will be filled by find_iso() function
+            "iso_mean": feature_row.get("iso", 0.0),
+            "charge_mean": feature_row.get("charge", 0.0),
+            "number_ms2": ms2_count,
+            "adducts": [[adduct, 1, 100.0]],  # Single adduct with 100% frequency
+            "adduct_top": adduct,
+            "adduct_charge_top": adduct_charge,
+            "adduct_mass_neutral_top": adduct_mass_neutral,
+            "adduct_mass_shift_top": adduct_mass_shift,
+            "id_top_name": None,
+            "id_top_class": None,
+            "id_top_adduct": None,
+            "id_top_score": None,
+            "id_source": None,
+        }
+
+        consensus_list.append(consensus_feature)
+
+        # Create mapping entry
+        mapping_entry = {
+            "consensus_uid": i,
+            "sample_uid": sample_uid,
+            "feature_uid": feature_row.get("feature_uid"),
+        }
+        mapping_list.append(mapping_entry)
+
+    # Create DataFrames
+    study.consensus_df = pl.DataFrame(consensus_list, strict=False)
+    study.consensus_mapping_df = pl.DataFrame(mapping_list, strict=False)
+
+    study.logger.info(f"Created {len(consensus_list)} consensus features from single sample")
+
+
 def _fast_correlation(x, y):
     """
     Fast correlation coefficient calculation for consensus matrix data.