masster 0.5.13__py3-none-any.whl → 0.5.15__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of masster might be problematic. Click here for more details.
- masster/_version.py +1 -1
- masster/lib/lib.py +371 -57
- masster/sample/adducts.py +8 -5
- masster/sample/processing.py +6 -0
- masster/study/helpers.py +1 -0
- masster/study/id.py +237 -39
- masster/study/importers.py +161 -52
- masster/study/merge.py +1 -1
- masster/study/plot.py +10 -18
- masster/study/study5_schema.json +9 -0
- masster/wizard/__init__.py +4 -4
- masster/wizard/wizard.py +919 -108
- {masster-0.5.13.dist-info → masster-0.5.15.dist-info}/METADATA +1 -1
- {masster-0.5.13.dist-info → masster-0.5.15.dist-info}/RECORD +17 -17
- {masster-0.5.13.dist-info → masster-0.5.15.dist-info}/WHEEL +0 -0
- {masster-0.5.13.dist-info → masster-0.5.15.dist-info}/entry_points.txt +0 -0
- {masster-0.5.13.dist-info → masster-0.5.15.dist-info}/licenses/LICENSE +0 -0
masster/_version.py
CHANGED
masster/lib/lib.py
CHANGED
|
@@ -46,11 +46,9 @@ annotations = lib.annotate_features(sample.features_df)
|
|
|
46
46
|
"""
|
|
47
47
|
|
|
48
48
|
import os
|
|
49
|
-
import uuid
|
|
50
49
|
from typing import Optional, Union, List, Dict, Any, TYPE_CHECKING
|
|
51
50
|
import warnings
|
|
52
51
|
|
|
53
|
-
import numpy as np
|
|
54
52
|
import polars as pl
|
|
55
53
|
import pyopenms as oms
|
|
56
54
|
|
|
@@ -58,6 +56,148 @@ if TYPE_CHECKING:
|
|
|
58
56
|
import pandas as pd
|
|
59
57
|
|
|
60
58
|
|
|
59
|
+
def _calculate_formula_mass_shift(formula: str) -> float:
    """
    Calculate mass shift from formula string like "+H", "-H2O", "+Na-H", etc.

    The string is cut into signed groups at every "+" or "-" character; a
    leading group without a sign is treated as added. Each group is expanded
    into element counts by ``_parse_element_counts`` and the matching masses
    are summed with the group's sign.

    Parameters
    ----------
    formula : str
        Formula string (e.g., "+H", "-H2O", "+Na-H")

    Returns
    -------
    float
        Mass shift in Daltons

    Warns
    -----
    UserWarning
        If a group contains a symbol that is not in the mass table below.
        Such symbols contribute nothing to the result, so the returned shift
        is incomplete; the warning makes that visible instead of silent.
    """
    # Masses of the most abundant isotope of each supported element.
    # NOTE(review): these are neutral-atom masses and no electron-mass
    # correction is applied here - confirm callers handle that for ions.
    atomic_masses = {
        "H": 1.007825,
        "C": 12.0,
        "N": 14.003074,
        "O": 15.994915,
        "Na": 22.989769,
        "K": 38.963707,
        "Li": 7.016003,
        "Ca": 39.962591,
        "Mg": 23.985042,
        "Fe": 55.934938,
        "Cl": 34.968853,
        "Br": 78.918336,
        "I": 126.904473,
        "P": 30.973762,
        "S": 31.972071,
    }

    # Split on + and - while remembering which operator preceded each group.
    signed_groups = []
    sign = 1
    group = ""
    for char in formula:
        if char in ("+", "-"):
            if group:
                signed_groups.append((sign, group))
                group = ""
            sign = 1 if char == "+" else -1
        else:
            group += char
    if group:
        signed_groups.append((sign, group))

    total_mass = 0.0
    unknown_symbols = []
    for sign, group in signed_groups:
        # e.g. "H2O" -> {"H": 2, "O": 1}
        for element, count in _parse_element_counts(group).items():
            mass = atomic_masses.get(element)
            if mass is None:
                if element not in unknown_symbols:
                    unknown_symbols.append(element)
                continue
            total_mass += sign * mass * count

    if unknown_symbols:
        warnings.warn(
            f"Unknown element symbol(s) {unknown_symbols} in adduct formula "
            f"{formula!r}; they were ignored when computing the mass shift.",
            stacklevel=2,
        )

    return total_mass
|
|
129
|
+
|
|
130
|
+
|
|
131
|
+
def _parse_element_counts(formula_part: str) -> Dict[str, int]:
    """
    Parse element counts from a formula part like 'H2O' -> {'H': 2, 'O': 1}.

    Surrounding whitespace is ignored. An optional leading integer is read
    as a coefficient for the whole part, so '2H' gives {'H': 2} and '2H2O'
    gives {'H': 4, 'O': 2}; this matches the '2H' / '3Na' groups that
    ``_format_adduct_name`` writes into adduct names.

    Parameters
    ----------
    formula_part : str
        Unsigned formula fragment (no '+' or '-').

    Returns
    -------
    Dict[str, int]
        Mapping of symbol to total count. A symbol is one character plus any
        lowercase letters that follow it; symbols are not validated here.
    """
    text = formula_part.strip()
    end = len(text)
    pos = 0

    # Optional leading coefficient that multiplies every element in the part.
    start = pos
    while pos < end and text[pos].isdigit():
        pos += 1
    multiplier = int(text[start:pos]) if pos > start else 1

    counts: Dict[str, int] = {}
    while pos < end:
        # Symbol: one character followed by any run of lowercase letters.
        start = pos
        pos += 1
        while pos < end and text[pos].islower():
            pos += 1
        symbol = text[start:pos]

        # Count: digits directly after the symbol, defaulting to 1.
        start = pos
        while pos < end and text[pos].isdigit():
            pos += 1
        count = int(text[start:pos]) if pos > start else 1

        counts[symbol] = counts.get(symbol, 0) + count * multiplier

    return counts
|
|
155
|
+
|
|
156
|
+
|
|
157
|
+
def _format_adduct_name(components: List[Dict]) -> str:
    """
    Format adduct name from components like [M+H]1+ or [M+2H]2+.

    Parameters
    ----------
    components : List[Dict]
        Adduct components; each must provide "formula" (e.g. "+H", "-H2O")
        and "charge" (int). A formula without a leading sign is treated as an
        addition and rendered with "+", so the name always has the
        "[M<sign><formula>...]" shape.

    Returns
    -------
    str
        "[M]" for an empty list, otherwise "[M...]" followed by the absolute
        total charge and its sign ("1+", "2-", ...); a total charge of zero
        gets no suffix.
    """
    if not components:
        return "[M]"

    from collections import Counter

    def _with_sign(formula: str) -> str:
        # Unsigned formulas would otherwise be glued straight onto "M".
        if not formula or formula.startswith(("+", "-")):
            return formula
        return f"+{formula}"

    formula_counts = Counter(_with_sign(comp["formula"]) for comp in components)
    total_charge = sum(comp["charge"] for comp in components)

    # Sorted so the same set of components always yields the same name.
    formula_parts = []
    for formula, count in sorted(formula_counts.items()):
        if count == 1:
            formula_parts.append(formula)
        elif formula.startswith(("+", "-")):
            # Repeated component: put the count between sign and formula (+2H).
            formula_parts.append(f"{formula[0]}{count}{formula[1:]}")
        else:
            formula_parts.append(f"{count}{formula}")

    if total_charge == 0:
        charge_str = ""
    else:
        charge_sign = "+" if total_charge > 0 else "-"
        charge_str = f"{abs(total_charge)}{charge_sign}"

    return f"[M{''.join(formula_parts)}]{charge_str}"
|
|
199
|
+
|
|
200
|
+
|
|
61
201
|
class Lib:
|
|
62
202
|
"""
|
|
63
203
|
A class for managing compound libraries and feature annotation in mass spectrometry data.
|
|
@@ -89,22 +229,21 @@ class Lib:
|
|
|
89
229
|
>>> print(f"Loaded {len(lib.lib_df)} library entries")
|
|
90
230
|
"""
|
|
91
231
|
|
|
92
|
-
#
|
|
93
|
-
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
|
|
97
|
-
|
|
98
|
-
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
"[M-2H]2-": {"delta_m": -2.014552, "delta_z": -2, "polarity": "negative"},
|
|
232
|
+
# Default adduct definitions using OpenMS format
|
|
233
|
+
DEFAULT_ADDUCTS = {
|
|
234
|
+
"positive": [
|
|
235
|
+
"+H:1:0.65",
|
|
236
|
+
"+Na:1:0.15",
|
|
237
|
+
"+K:1:0.05",
|
|
238
|
+
"+NH4:1:0.15",
|
|
239
|
+
"-H2O:0:0.15",
|
|
240
|
+
],
|
|
241
|
+
"negative": [
|
|
242
|
+
"-H:-1:0.9",
|
|
243
|
+
"+Cl:-1:0.1",
|
|
244
|
+
"+CH2O2:0:0.15",
|
|
245
|
+
"-H2O:0:0.15",
|
|
246
|
+
]
|
|
108
247
|
}
|
|
109
248
|
|
|
110
249
|
def __init__(self):
|
|
@@ -119,12 +258,15 @@ class Lib:
|
|
|
119
258
|
"cmpd_uid": pl.Series([], dtype=pl.Int64),
|
|
120
259
|
"source_id": pl.Series([], dtype=pl.Utf8),
|
|
121
260
|
"name": pl.Series([], dtype=pl.Utf8),
|
|
261
|
+
"shortname": pl.Series([], dtype=pl.Utf8),
|
|
262
|
+
"class": pl.Series([], dtype=pl.Utf8),
|
|
122
263
|
"smiles": pl.Series([], dtype=pl.Utf8),
|
|
123
264
|
"inchi": pl.Series([], dtype=pl.Utf8),
|
|
124
265
|
"inchikey": pl.Series([], dtype=pl.Utf8),
|
|
125
266
|
"formula": pl.Series([], dtype=pl.Utf8),
|
|
126
267
|
"iso": pl.Series([], dtype=pl.Int64),
|
|
127
268
|
"adduct": pl.Series([], dtype=pl.Utf8),
|
|
269
|
+
"probability": pl.Series([], dtype=pl.Float64),
|
|
128
270
|
"m": pl.Series([], dtype=pl.Float64),
|
|
129
271
|
"z": pl.Series([], dtype=pl.Int8),
|
|
130
272
|
"mz": pl.Series([], dtype=pl.Float64),
|
|
@@ -134,6 +276,177 @@ class Lib:
|
|
|
134
276
|
"db": pl.Series([], dtype=pl.Utf8),
|
|
135
277
|
})
|
|
136
278
|
|
|
279
|
+
def _get_adducts(
    self,
    adducts_list: Optional[List[str]] = None,
    polarity: Optional[str] = None,
    min_probability: float = 0.03,
    **kwargs,
) -> pl.DataFrame:
    """
    Generate comprehensive adduct specifications for the library.

    Each entry of ``adducts_list`` has the form "formula:charge:probability"
    (e.g. "+H:1:0.65"). Entries that are not strings, do not have exactly
    three ":"-separated fields, or whose charge/probability cannot be
    converted are skipped. From the remaining base adducts three kinds of
    rows are produced: single adducts, repeated charged adducts, and one
    positively charged adduct paired with one neutral adduct.

    Args:
        adducts_list: List of adduct specifications in OpenMS format (e.g., "+H:1:0.65").
            If None, ``self.DEFAULT_ADDUCTS`` for the given polarity is used.
        polarity: "positive"/"pos", "negative"/"neg", or None for positive.
            Only consulted when ``adducts_list`` is None.
        min_probability: Minimum probability threshold to filter adducts
        **kwargs: Additional parameters for adduct generation:
            charge_min (default -2) and charge_max (default 2) bound the
            total charge of every generated row; max_combinations
            (default 2) enables repeated and mixed rows when >= 2.

    Returns:
        DataFrame with columns:
        - name: Formatted adduct name like "[M+H]1+"
        - charge: Total charge of the adduct
        - mass_shift: Total mass shift in Da
        - probability: Combined probability score
        - complexity: Number of adduct components
        Rows are ordered by descending probability, then ascending complexity.

    Raises:
        ValueError: If ``adducts_list`` is None and ``polarity`` is not recognised.
    """
    # Explicit schema for the no-result case, so an empty frame has typed
    # columns; the dtypes mirror what the populated frame holds.
    empty_schema = {
        "name": pl.Utf8,
        "charge": pl.Int64,
        "mass_shift": pl.Float64,
        "probability": pl.Float64,
        "complexity": pl.Int64,
    }

    # Get adduct specifications
    if adducts_list is None:
        # Positive mode is the default when no polarity is given.
        mode = "positive" if polarity is None else polarity.lower()
        if mode in ("positive", "pos"):
            adducts_list = self.DEFAULT_ADDUCTS["positive"]
        elif mode in ("negative", "neg"):
            adducts_list = self.DEFAULT_ADDUCTS["negative"]
        else:
            raise ValueError(f"Unknown polarity: {polarity}")

    # Parameters
    charge_min = kwargs.get("charge_min", -2)
    charge_max = kwargs.get("charge_max", 2)
    max_combinations = kwargs.get("max_combinations", 2)

    # Parse base adduct specifications
    base_specs = []
    for adduct_str in adducts_list:
        if not isinstance(adduct_str, str):
            continue
        parts = adduct_str.split(":")
        if len(parts) != 3:
            continue
        formula_part = parts[0]
        try:
            charge = int(parts[1])
            probability = float(parts[2])
            mass_shift = _calculate_formula_mass_shift(formula_part)
        except (ValueError, IndexError):
            # Malformed specification: skip it rather than abort the import.
            continue
        base_specs.append({
            "formula": formula_part,
            "charge": charge,
            "mass_shift": mass_shift,
            "probability": probability,
            "raw_string": adduct_str,
        })

    if not base_specs:
        return pl.DataFrame(schema=empty_schema)

    # Separate specs by charge type
    positive_specs = [spec for spec in base_specs if spec["charge"] > 0]
    negative_specs = [spec for spec in base_specs if spec["charge"] < 0]
    neutral_specs = [spec for spec in base_specs if spec["charge"] == 0]

    combinations_list = []

    # 1. Single adducts
    for spec in base_specs:
        if charge_min <= spec["charge"] <= charge_max:
            combinations_list.append({
                "components": [spec],
                "formatted_name": _format_adduct_name([spec]),
                "total_mass_shift": spec["mass_shift"],
                "total_charge": spec["charge"],
                "combined_probability": spec["probability"],
                "complexity": 1,
            })

    if max_combinations >= 2:
        # 2. Multiply charged versions (2H+, 3H+, etc.); at most 3 copies.
        for spec in positive_specs + negative_specs:
            for multiplier in range(2, min(max_combinations + 1, 4)):
                total_charge = spec["charge"] * multiplier
                if not charge_min <= total_charge <= charge_max:
                    continue
                components = [spec] * multiplier
                combinations_list.append({
                    "components": components,
                    "formatted_name": _format_adduct_name(components),
                    "total_mass_shift": spec["mass_shift"] * multiplier,
                    "total_charge": total_charge,
                    "combined_probability": spec["probability"] ** multiplier,
                    "complexity": multiplier,
                })

        # 3. Mixed combinations (positive + neutral)
        # NOTE(review): negative + neutral pairs are not generated here -
        # confirm that is intended for negative-mode libraries.
        for pos_spec in positive_specs:
            for neut_spec in neutral_specs:
                total_charge = pos_spec["charge"] + neut_spec["charge"]
                if not charge_min <= total_charge <= charge_max:
                    continue
                components = [pos_spec, neut_spec]
                combinations_list.append({
                    "components": components,
                    "formatted_name": _format_adduct_name(components),
                    "total_mass_shift": pos_spec["mass_shift"] + neut_spec["mass_shift"],
                    "total_charge": total_charge,
                    "combined_probability": pos_spec["probability"] * neut_spec["probability"],
                    "complexity": 2,
                })

    if not combinations_list:
        # Every candidate fell outside [charge_min, charge_max].
        return pl.DataFrame(schema=empty_schema)

    # Most probable first; simpler adducts win ties.
    combinations_list.sort(
        key=lambda x: (-x["combined_probability"], x["complexity"])
    )

    # Convert to polars DataFrame
    adducts_df = pl.DataFrame([
        {
            "name": combo["formatted_name"],
            "charge": combo["total_charge"],
            "mass_shift": combo["total_mass_shift"],
            "probability": combo["combined_probability"],
            "complexity": combo["complexity"],
        }
        for combo in combinations_list
    ])

    # Filter by minimum probability
    if min_probability > 0.0 and len(adducts_df) > 0:
        adducts_df = adducts_df.filter(pl.col("probability") >= min_probability)

    return adducts_df
|
|
449
|
+
|
|
137
450
|
def _calculate_accurate_mass(self, formula: str) -> Optional[float]:
|
|
138
451
|
"""
|
|
139
452
|
Calculate the accurate mass for a molecular formula using PyOpenMS.
|
|
@@ -185,15 +498,17 @@ class Lib:
|
|
|
185
498
|
compound_data: Dict[str, Any],
|
|
186
499
|
adducts: Optional[List[str]] = None,
|
|
187
500
|
polarity: Optional[str] = None,
|
|
188
|
-
lib_id_counter: Optional[int] = None
|
|
501
|
+
lib_id_counter: Optional[int] = None,
|
|
502
|
+
min_probability: float = 0.03) -> tuple[List[Dict[str, Any]], int]:
|
|
189
503
|
"""
|
|
190
|
-
Generate adduct variants for a given compound.
|
|
504
|
+
Generate adduct variants for a given compound using the new adduct system.
|
|
191
505
|
|
|
192
506
|
Args:
|
|
193
507
|
compound_data: Dictionary containing compound information
|
|
194
|
-
adducts: List of specific adducts to generate. If None, uses
|
|
195
|
-
polarity: Ionization polarity ("positive", "negative", or None for
|
|
508
|
+
adducts: List of specific adducts to generate. If None, uses defaults for polarity
|
|
509
|
+
polarity: Ionization polarity ("positive", "negative", or None for positive)
|
|
196
510
|
lib_id_counter: Counter for generating unique lib_uid values
|
|
511
|
+
min_probability: Minimum probability threshold for adduct filtering
|
|
197
512
|
|
|
198
513
|
Returns:
|
|
199
514
|
Tuple of (list of dictionaries representing adduct variants, updated counter)
|
|
@@ -206,35 +521,25 @@ class Lib:
|
|
|
206
521
|
if accurate_mass is None:
|
|
207
522
|
return variants, counter
|
|
208
523
|
|
|
209
|
-
#
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
217
|
-
|
|
218
|
-
if props["polarity"] == polarity.lower()
|
|
219
|
-
]
|
|
220
|
-
else:
|
|
221
|
-
selected_adducts = adducts
|
|
524
|
+
# Get adduct specifications using _get_adducts
|
|
525
|
+
adducts_df = self._get_adducts(
|
|
526
|
+
adducts_list=adducts,
|
|
527
|
+
polarity=polarity,
|
|
528
|
+
min_probability=min_probability
|
|
529
|
+
)
|
|
530
|
+
|
|
531
|
+
if len(adducts_df) == 0:
|
|
532
|
+
return variants, counter
|
|
222
533
|
|
|
223
534
|
# Generate variants for each adduct
|
|
224
|
-
for
|
|
225
|
-
|
|
226
|
-
|
|
227
|
-
|
|
228
|
-
|
|
229
|
-
adduct_props = self.ADDUCT_DEFINITIONS[adduct]
|
|
230
|
-
|
|
231
|
-
# Skip if polarity doesn't match
|
|
232
|
-
if polarity is not None and adduct_props["polarity"] != polarity.lower():
|
|
233
|
-
continue
|
|
535
|
+
for adduct_row in adducts_df.iter_rows(named=True):
|
|
536
|
+
adduct_name = adduct_row["name"]
|
|
537
|
+
charge = adduct_row["charge"]
|
|
538
|
+
mass_shift = adduct_row["mass_shift"]
|
|
539
|
+
probability = adduct_row["probability"]
|
|
234
540
|
|
|
235
541
|
# Calculate adducted mass and m/z
|
|
236
|
-
adducted_mass = accurate_mass +
|
|
237
|
-
charge = adduct_props["delta_z"]
|
|
542
|
+
adducted_mass = accurate_mass + mass_shift
|
|
238
543
|
mz = abs(adducted_mass / charge) if charge != 0 else adducted_mass
|
|
239
544
|
|
|
240
545
|
# Create variant entry
|
|
@@ -243,12 +548,15 @@ class Lib:
|
|
|
243
548
|
"cmpd_uid": compound_data.get("cmpd_uid", None),
|
|
244
549
|
"source_id": compound_data.get("source_id", None),
|
|
245
550
|
"name": compound_data.get("name", ""),
|
|
551
|
+
"shortname": compound_data.get("shortname", ""),
|
|
552
|
+
"class": compound_data.get("class", ""),
|
|
246
553
|
"smiles": compound_data.get("smiles", ""),
|
|
247
554
|
"inchi": compound_data.get("inchi", ""),
|
|
248
555
|
"inchikey": compound_data.get("inchikey", ""),
|
|
249
556
|
"formula": compound_data["formula"],
|
|
250
557
|
"iso": 0, # Default to zero
|
|
251
|
-
"adduct":
|
|
558
|
+
"adduct": adduct_name,
|
|
559
|
+
"probability": probability,
|
|
252
560
|
"m": adducted_mass,
|
|
253
561
|
"z": charge,
|
|
254
562
|
"mz": mz,
|
|
@@ -265,7 +573,8 @@ class Lib:
|
|
|
265
573
|
def import_csv(self,
|
|
266
574
|
csvfile: str,
|
|
267
575
|
polarity: Optional[str] = None,
|
|
268
|
-
adducts: Optional[List[str]] = None
|
|
576
|
+
adducts: Optional[List[str]] = None,
|
|
577
|
+
min_probability: float = 0.03) -> None:
|
|
269
578
|
"""
|
|
270
579
|
Import compound library from a CSV file.
|
|
271
580
|
|
|
@@ -274,8 +583,9 @@ class Lib:
|
|
|
274
583
|
|
|
275
584
|
Args:
|
|
276
585
|
csvfile: Path to the CSV file
|
|
277
|
-
polarity: Ionization polarity ("positive", "negative", or None for
|
|
278
|
-
adducts: Specific adducts to generate. If None, generates
|
|
586
|
+
polarity: Ionization polarity ("positive", "negative", or None for positive)
|
|
587
|
+
adducts: Specific adducts to generate. If None, generates defaults for the polarity
|
|
588
|
+
min_probability: Minimum probability threshold for adduct filtering
|
|
279
589
|
|
|
280
590
|
Expected CSV columns (case-insensitive):
|
|
281
591
|
- Required: Formula (or formula)
|
|
@@ -319,6 +629,8 @@ class Lib:
|
|
|
319
629
|
|
|
320
630
|
compound_data = {
|
|
321
631
|
"name": row.get(column_mapping.get("name", ""), ""),
|
|
632
|
+
"shortname": row.get(column_mapping.get("shortname", ""), ""),
|
|
633
|
+
"class": row.get(column_mapping.get("class", ""), ""),
|
|
322
634
|
"smiles": row.get(column_mapping.get("smiles", ""), ""),
|
|
323
635
|
"inchi": row.get(column_mapping.get("inchi", ""), ""),
|
|
324
636
|
"inchikey": row.get(column_mapping.get("inchikey", ""), ""),
|
|
@@ -331,7 +643,8 @@ class Lib:
|
|
|
331
643
|
|
|
332
644
|
# Generate adduct variants
|
|
333
645
|
variants, lib_id_counter = self._generate_adduct_variants(
|
|
334
|
-
compound_data, adducts=adducts, polarity=polarity,
|
|
646
|
+
compound_data, adducts=adducts, polarity=polarity,
|
|
647
|
+
lib_id_counter=lib_id_counter, min_probability=min_probability
|
|
335
648
|
)
|
|
336
649
|
all_variants.extend(variants)
|
|
337
650
|
|
|
@@ -349,7 +662,8 @@ class Lib:
|
|
|
349
662
|
compound_data_rt2["name"] = compound_data["name"] + " II"
|
|
350
663
|
|
|
351
664
|
variants_rt2, lib_id_counter = self._generate_adduct_variants(
|
|
352
|
-
compound_data_rt2, adducts=adducts, polarity=polarity,
|
|
665
|
+
compound_data_rt2, adducts=adducts, polarity=polarity,
|
|
666
|
+
lib_id_counter=lib_id_counter, min_probability=min_probability
|
|
353
667
|
)
|
|
354
668
|
all_variants.extend(variants_rt2)
|
|
355
669
|
|
|
@@ -529,6 +843,8 @@ class Lib:
|
|
|
529
843
|
"cmpd_uid": match_row.get("cmpd_uid"),
|
|
530
844
|
"source_id": match_row.get("source_id"),
|
|
531
845
|
"name": match_row["name"],
|
|
846
|
+
"shortname": match_row["shortname"],
|
|
847
|
+
"class": match_row["class"],
|
|
532
848
|
"formula": match_row["formula"],
|
|
533
849
|
"iso": match_row.get("iso", 0),
|
|
534
850
|
"adduct": match_row["adduct"],
|
|
@@ -555,10 +871,8 @@ class Lib:
|
|
|
555
871
|
Returns:
|
|
556
872
|
List of adduct names
|
|
557
873
|
"""
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
if props["polarity"] == polarity.lower()
|
|
561
|
-
]
|
|
874
|
+
adducts_df = self._get_adducts(polarity=polarity, min_probability=0.0)
|
|
875
|
+
return adducts_df.select("name").to_series().to_list()
|
|
562
876
|
|
|
563
877
|
def __len__(self) -> int:
|
|
564
878
|
"""Return number of library entries."""
|
masster/sample/adducts.py
CHANGED
|
@@ -107,7 +107,9 @@ def _get_adducts(self, adducts_list: list = None, **kwargs):
|
|
|
107
107
|
|
|
108
108
|
# 1. Single adducts
|
|
109
109
|
for spec in base_specs:
|
|
110
|
-
|
|
110
|
+
# For neutral adducts (charge=0), always allow them
|
|
111
|
+
# For charged adducts, check if absolute value is within range
|
|
112
|
+
if spec["charge"] == 0 or (charge_min <= abs(spec["charge"]) <= charge_max):
|
|
111
113
|
formatted_name = _format_adduct_name([spec])
|
|
112
114
|
combinations_list.append(
|
|
113
115
|
{
|
|
@@ -125,7 +127,7 @@ def _get_adducts(self, adducts_list: list = None, **kwargs):
|
|
|
125
127
|
base_charge = spec["charge"]
|
|
126
128
|
for multiplier in range(2, min(max_combinations + 1, 5)):
|
|
127
129
|
total_charge = base_charge * multiplier
|
|
128
|
-
if charge_min <= total_charge <= charge_max:
|
|
130
|
+
if charge_min <= abs(total_charge) <= charge_max:
|
|
129
131
|
components = [spec] * multiplier
|
|
130
132
|
formatted_name = _format_adduct_name(components)
|
|
131
133
|
|
|
@@ -146,7 +148,8 @@ def _get_adducts(self, adducts_list: list = None, **kwargs):
|
|
|
146
148
|
for pos_spec in positive_specs:
|
|
147
149
|
for neut_spec in neutral_specs:
|
|
148
150
|
total_charge = pos_spec["charge"] + neut_spec["charge"]
|
|
149
|
-
|
|
151
|
+
# For combinations with neutrals, the total charge should follow abs() rule only if non-zero
|
|
152
|
+
if total_charge == 0 or (charge_min <= abs(total_charge) <= charge_max):
|
|
150
153
|
components = [pos_spec, neut_spec]
|
|
151
154
|
formatted_name = _format_adduct_name(components)
|
|
152
155
|
combinations_list.append(
|
|
@@ -166,7 +169,7 @@ def _get_adducts(self, adducts_list: list = None, **kwargs):
|
|
|
166
169
|
for combo in combinations(positive_specs, 2):
|
|
167
170
|
if combo[0]["formula"] != combo[1]["formula"]:
|
|
168
171
|
total_charge = combo[0]["charge"] + combo[1]["charge"]
|
|
169
|
-
if charge_min <= total_charge <= charge_max:
|
|
172
|
+
if total_charge == 0 or (charge_min <= abs(total_charge) <= charge_max):
|
|
170
173
|
components = list(combo)
|
|
171
174
|
formatted_name = _format_adduct_name(components)
|
|
172
175
|
combinations_list.append(
|
|
@@ -189,7 +192,7 @@ def _get_adducts(self, adducts_list: list = None, **kwargs):
|
|
|
189
192
|
components = [pos_spec] + list(neut_combo)
|
|
190
193
|
total_charge = sum(spec["charge"] for spec in components)
|
|
191
194
|
|
|
192
|
-
if charge_min <= total_charge <= charge_max:
|
|
195
|
+
if total_charge == 0 or (charge_min <= abs(total_charge) <= charge_max):
|
|
193
196
|
formatted_name = _format_adduct_name(components)
|
|
194
197
|
total_mass_shift = sum(spec["mass_shift"] for spec in components)
|
|
195
198
|
combined_prob = np.prod(
|
masster/sample/processing.py
CHANGED
|
@@ -586,6 +586,12 @@ def find_features(self, **kwargs):
|
|
|
586
586
|
self.logger.debug(
|
|
587
587
|
f"Parameters: chrom_fwhm={params.get('chrom_fwhm')}, noise={params.get('noise')}, tol_ppm={params.get('tol_ppm')}, isotope_filtering_model={params.get('isotope_filtering_model')}",
|
|
588
588
|
)
|
|
589
|
+
# check that noise is not lower than 1% quantile of ms1_df inty
|
|
590
|
+
noise_threshold = self.ms1_df.select(pl.col("inty")).quantile(0.01)[0, 0]
|
|
591
|
+
if params.get("noise") < noise_threshold / 10:
|
|
592
|
+
self.logger.warning(
|
|
593
|
+
f"Warning: noise threshold {params.get('noise')} is lower than 1% quantile of MS1 intensities ({noise_threshold:.1f}). This may lead to many false positives.",
|
|
594
|
+
)
|
|
589
595
|
|
|
590
596
|
exp = oms.MSExperiment()
|
|
591
597
|
# find max number of cycles in self.ms1_df
|
masster/study/helpers.py
CHANGED
|
@@ -490,6 +490,7 @@ def align_reset(self):
|
|
|
490
490
|
# Ensure column order is maintained after with_columns operation
|
|
491
491
|
from masster.study.helpers import _ensure_features_df_schema_order
|
|
492
492
|
_ensure_features_df_schema_order(self)
|
|
493
|
+
self.logger.info("Alignment reset: all feature RTs set to original_RT.")
|
|
493
494
|
|
|
494
495
|
|
|
495
496
|
# =====================================================================================
|