masster 0.5.22__py3-none-any.whl → 0.5.24__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of masster might be problematic.
- masster/_version.py +1 -1
- masster/logger.py +35 -19
- masster/sample/adducts.py +15 -29
- masster/sample/defaults/find_adducts_def.py +1 -3
- masster/sample/defaults/sample_def.py +4 -4
- masster/sample/h5.py +203 -361
- masster/sample/helpers.py +14 -30
- masster/sample/lib.py +3 -3
- masster/sample/load.py +21 -29
- masster/sample/plot.py +222 -132
- masster/sample/processing.py +42 -55
- masster/sample/sample.py +37 -46
- masster/sample/save.py +37 -61
- masster/sample/sciex.py +13 -11
- masster/sample/thermo.py +69 -74
- masster/spectrum.py +15 -15
- masster/study/analysis.py +650 -586
- masster/study/defaults/identify_def.py +1 -3
- masster/study/defaults/merge_def.py +6 -7
- masster/study/defaults/study_def.py +1 -5
- masster/study/export.py +35 -96
- masster/study/h5.py +134 -211
- masster/study/helpers.py +385 -459
- masster/study/id.py +239 -290
- masster/study/importers.py +84 -93
- masster/study/load.py +159 -178
- masster/study/merge.py +1112 -1098
- masster/study/plot.py +195 -149
- masster/study/processing.py +144 -191
- masster/study/save.py +14 -13
- masster/study/study.py +89 -130
- masster/wizard/wizard.py +764 -714
- {masster-0.5.22.dist-info → masster-0.5.24.dist-info}/METADATA +27 -1
- {masster-0.5.22.dist-info → masster-0.5.24.dist-info}/RECORD +37 -37
- {masster-0.5.22.dist-info → masster-0.5.24.dist-info}/WHEEL +0 -0
- {masster-0.5.22.dist-info → masster-0.5.24.dist-info}/entry_points.txt +0 -0
- {masster-0.5.22.dist-info → masster-0.5.24.dist-info}/licenses/LICENSE +0 -0
masster/study/id.py
CHANGED
@@ -60,11 +60,11 @@ def lib_load(
)

lib_obj = Lib()
-
+
# Determine file type by extension
- if lib_source.lower().endswith(
+ if lib_source.lower().endswith(".json"):
lib_obj.import_json(lib_source, polarity=polarity, adducts=adducts)
- elif lib_source.lower().endswith(
+ elif lib_source.lower().endswith(".csv"):
lib_obj.import_csv(lib_source, polarity=polarity, adducts=adducts)
else:
# Default to CSV behavior for backward compatibility
@@ -112,15 +112,13 @@ def lib_load(
# Add source_id column with filename (without path) if loading from CSV/JSON
if isinstance(lib_source, str):
import os
+
filename_only = os.path.basename(lib_source)
filtered_lf = filtered_lf.with_columns(pl.lit(filename_only).alias("source_id"))

# Ensure required columns exist and set correct values
- required_columns = {
-
- "iso": pl.Int64
- }
-
+ required_columns = {"quant_group": pl.Int64, "iso": pl.Int64}
+
for col_name, col_dtype in required_columns.items():
if col_name == "quant_group":
# Set quant_group using cmpd_uid (same for isotopomers of same compound)
@@ -133,21 +131,24 @@ def lib_load(
if col_name not in filtered_lf.columns:
# Default to zero for iso
filtered_lf = filtered_lf.with_columns(pl.lit(0).cast(col_dtype).alias(col_name))
-
+
# Generate 13C isotopes if requested
original_count = len(filtered_lf)
- if iso ==
+ if iso == "13C":
filtered_lf = _generate_13c_isotopes(filtered_lf)
# Update the log message to show the correct count after isotope generation
if isinstance(lib_source, str):
import os
+
filename_only = os.path.basename(lib_source)
- print(
+ print(
+ f"Generated 13C isotopes: {len(filtered_lf)} total entries ({original_count} original + {len(filtered_lf) - original_count} isotopes) from {filename_only}"
+ )

# Reorder columns to place quant_group after rt and iso after formula
column_order = []
columns_list = list(filtered_lf.columns)
-
+
for col in columns_list:
if col not in column_order: # Only add if not already added
column_order.append(col)
@@ -156,22 +157,17 @@ def lib_load(
elif col == "formula" and "iso" in columns_list and "iso" not in column_order:
column_order.append("iso")

-
# Add to existing lib_df instead of replacing
- if (
- hasattr(study, "lib_df")
- and study.lib_df is not None
- and not study.lib_df.is_empty()
- ):
+ if hasattr(study, "lib_df") and study.lib_df is not None and not study.lib_df.is_empty():
# Check for schema compatibility and handle mismatches
existing_cols = set(study.lib_df.columns)
new_cols = set(filtered_lf.columns)
-
+
# If schemas don't match, we need to align them
if existing_cols != new_cols:
# Get union of all columns
all_cols = existing_cols.union(new_cols)
-
+
# Add missing columns to existing data with appropriate defaults
for col in new_cols - existing_cols:
if col == "probability":
@@ -180,10 +176,12 @@ def lib_load(
try:
adduct_prob_map = _get_adduct_probabilities(study)
study.lib_df = study.lib_df.with_columns(
- pl.col("adduct")
+ pl.col("adduct")
+ .map_elements(
lambda adduct: adduct_prob_map.get(adduct, 1.0) if adduct is not None else 1.0,
- return_dtype=pl.Float64
- )
+ return_dtype=pl.Float64,
+ )
+ .alias("probability")
)
except Exception:
study.lib_df = study.lib_df.with_columns(pl.lit(1.0).alias("probability"))
@@ -200,16 +198,16 @@ def lib_load(
else:
# Default to null for other columns
study.lib_df = study.lib_df.with_columns(pl.lit(None).alias(col))
-
+
# Add missing columns to new data with appropriate defaults
for col in existing_cols - new_cols:
if col not in ["probability", "iso", "quant_group"]: # These should already be handled
filtered_lf = filtered_lf.with_columns(pl.lit(None).alias(col))
-
+
# Ensure column order matches for concatenation - use existing column order
existing_column_order = list(study.lib_df.columns)
filtered_lf = filtered_lf.select(existing_column_order)
-
+
# Concatenate with existing data
study.lib_df = pl.concat([study.lib_df, filtered_lf])
else:
@@ -218,14 +216,14 @@ def lib_load(
study.lib_df = (
filtered_lf.clone()
if hasattr(filtered_lf, "clone")
- else pl.DataFrame(filtered_lf.to_dict() if hasattr(filtered_lf,
+ else pl.DataFrame(filtered_lf.to_dict() if hasattr(filtered_lf, "to_dict") else filtered_lf)
)
except Exception:
try:
study.lib_df = (
pl.from_pandas(filtered_lf)
if hasattr(filtered_lf, "to_pandas")
- else pl.DataFrame(filtered_lf.to_dict() if hasattr(filtered_lf,
+ else pl.DataFrame(filtered_lf.to_dict() if hasattr(filtered_lf, "to_dict") else filtered_lf)
)
except Exception:
study.lib_df = pl.DataFrame()
@@ -265,20 +263,17 @@ def _setup_identify_parameters(params, kwargs):
# Override parameters with any provided kwargs
if kwargs:
# Handle parameter name mapping for backwards compatibility
- param_mapping = {
-
- 'mz_tolerance': 'mz_tol'
- }
-
+ param_mapping = {"rt_tolerance": "rt_tol", "mz_tolerance": "mz_tol"}
+
for param_name, value in kwargs.items():
# Check if we need to map the parameter name
mapped_name = param_mapping.get(param_name, param_name)
-
+
if hasattr(params, mapped_name):
setattr(params, mapped_name, value)
elif hasattr(params, param_name):
setattr(params, param_name, value)
-
+
return params


@@ -287,9 +282,7 @@ def _smart_reset_id_results(study, target_uids, logger):
if target_uids is not None:
# Selective reset: only clear results for features being re-identified
if hasattr(study, "id_df") and study.id_df is not None and not study.id_df.is_empty():
- study.id_df = study.id_df.filter(
- ~pl.col("consensus_uid").is_in(target_uids)
- )
+ study.id_df = study.id_df.filter(~pl.col("consensus_uid").is_in(target_uids))
if logger:
logger.debug(f"Cleared previous results for {len(target_uids)} specific features")
elif not hasattr(study, "id_df"):
@@ -305,21 +298,23 @@ def _get_cached_adduct_probabilities(study, logger):
"""Get adduct probabilities with caching to avoid repeated expensive computation."""
# Check if we have cached results and cache key matches current parameters
current_cache_key = _get_adduct_cache_key(study)
-
- if (
- hasattr(study,
- study
+
+ if (
+ hasattr(study, "_cached_adduct_probs")
+ and hasattr(study, "_cached_adduct_key")
+ and study._cached_adduct_key == current_cache_key
+ ):
if logger:
logger.debug("Using cached adduct probabilities")
return study._cached_adduct_probs
-
+
# Compute and cache
if logger:
logger.debug("Computing adduct probabilities...")
adduct_prob_map = _get_adduct_probabilities(study)
study._cached_adduct_probs = adduct_prob_map
study._cached_adduct_key = current_cache_key
-
+
if logger:
logger.debug(f"Computed and cached probabilities for {len(adduct_prob_map)} adducts")
return adduct_prob_map
@@ -327,28 +322,30 @@ def _get_cached_adduct_probabilities(study, logger):

def _get_adduct_cache_key(study):
"""Generate a cache key based on adduct-related parameters."""
- if hasattr(study,
- adducts_str =
- min_prob = getattr(study.parameters,
+ if hasattr(study, "parameters") and hasattr(study.parameters, "adducts"):
+ adducts_str = "|".join(sorted(study.parameters.adducts)) if study.parameters.adducts else ""
+ min_prob = getattr(study.parameters, "adduct_min_probability", 0.04)
return f"adducts:{adducts_str}:min_prob:{min_prob}"
return "default"


def clear_identification_cache(study):
"""Clear cached identification data (useful when parameters change)."""
- cache_attrs = [
+ cache_attrs = ["_cached_adduct_probs", "_cached_adduct_key"]
for attr in cache_attrs:
if hasattr(study, attr):
delattr(study, attr)


- def _perform_identification_matching(
+ def _perform_identification_matching(
+ consensus_to_process, study, effective_mz_tol, effective_rt_tol, adduct_prob_map, logger
+ ):
"""Perform optimized identification matching using vectorized operations where possible."""
results = []
-
+
# Get library data as arrays for faster access
lib_df = study.lib_df
-
+
if logger:
consensus_count = len(consensus_to_process)
lib_count = len(lib_df)
@@ -361,7 +358,7 @@ def _perform_identification_matching(consensus_to_process, study, effective_mz_t
cons_uid = cons_row.get("consensus_uid")
cons_mz = cons_row.get("mz")
cons_rt = cons_row.get("rt")
-
+
if cons_mz is None:
if logger:
logger.debug(f"Skipping consensus feature {cons_uid} - no m/z value")
@@ -372,18 +369,14 @@ def _perform_identification_matching(consensus_to_process, study, effective_mz_t
matches = _find_matches_vectorized(
lib_df, cons_mz, cons_rt, effective_mz_tol, effective_rt_tol, logger, cons_uid
)
-
+
# Convert matches to result format
match_results = []
if not matches.is_empty():
for match_row in matches.iter_rows(named=True):
mz_delta = abs(cons_mz - match_row.get("mz")) if match_row.get("mz") is not None else None
lib_rt = match_row.get("rt")
- rt_delta = (
- abs(cons_rt - lib_rt)
- if (cons_rt is not None and lib_rt is not None)
- else None
- )
+ rt_delta = abs(cons_rt - lib_rt) if (cons_rt is not None and lib_rt is not None) else None

# Get library probability as base score, then multiply by adduct probability
lib_probability = match_row.get("probability", 1.0) if match_row.get("probability") is not None else 1.0
@@ -400,22 +393,20 @@ def _perform_identification_matching(consensus_to_process, study, effective_mz_t
"matcher": "ms1",
"score": score,
})
-
+
results.append({"consensus_uid": cons_uid, "matches": match_results})
-
+
return results


def _find_matches_vectorized(lib_df, cons_mz, cons_rt, mz_tol, rt_tol, logger, cons_uid):
"""
Find library matches using optimized vectorized operations.
-
+
FIXED VERSION: Prevents incorrect matching of same compound to different m/z values.
"""
# Filter by m/z tolerance using vectorized operations
- matches = lib_df.filter(
- (pl.col("mz") >= cons_mz - mz_tol) & (pl.col("mz") <= cons_mz + mz_tol)
- )
+ matches = lib_df.filter((pl.col("mz") >= cons_mz - mz_tol) & (pl.col("mz") <= cons_mz + mz_tol))

initial_match_count = len(matches)

@@ -423,14 +414,11 @@ def _find_matches_vectorized(lib_df, cons_mz, cons_rt, mz_tol, rt_tol, logger, c
if rt_tol is not None and cons_rt is not None and not matches.is_empty():
# First, check if any m/z matches have RT data
rt_candidates = matches.filter(pl.col("rt").is_not_null())
-
+
if not rt_candidates.is_empty():
# Apply RT filtering to candidates with RT data
- rt_matches = rt_candidates.filter(
-
- (pl.col("rt") <= cons_rt + rt_tol)
- )
-
+ rt_matches = rt_candidates.filter((pl.col("rt") >= cons_rt - rt_tol) & (pl.col("rt") <= cons_rt + rt_tol))
+
if not rt_matches.is_empty():
matches = rt_matches
if logger:
@@ -458,12 +446,14 @@ def _find_matches_vectorized(lib_df, cons_mz, cons_rt, mz_tol, rt_tol, logger, c
strict_matches = matches.filter(
(pl.col("mz") >= cons_mz - strict_mz_tol) & (pl.col("mz") <= cons_mz + strict_mz_tol)
)
-
+
if not strict_matches.is_empty():
# Use strict matches if available
matches = strict_matches
if logger:
- logger.debug(
+ logger.debug(
+ f"Consensus {cons_uid}: Using {len(matches)} strict m/z matches (within {strict_mz_tol:.6f} Da)"
+ )
else:
if logger:
logger.debug(f"Consensus {cons_uid}: No strict matches, using {len(matches)} loose matches")
@@ -472,21 +462,18 @@ def _find_matches_vectorized(lib_df, cons_mz, cons_rt, mz_tol, rt_tol, logger, c
if not matches.is_empty() and len(matches) > 1:
if "formula" in matches.columns and "adduct" in matches.columns:
pre_dedup_count = len(matches)
-
+
# Calculate m/z error for sorting
- matches = matches.with_columns([
-
- ])
-
+ matches = matches.with_columns([(pl.col("mz") - cons_mz).abs().alias("mz_error_abs")])
+
# Group by formula and adduct, but keep the most accurate m/z match
matches = (
- matches
- .sort(["mz_error_abs", "lib_uid"]) # Sort by m/z accuracy first, then lib_uid for consistency
+ matches.sort(["mz_error_abs", "lib_uid"]) # Sort by m/z accuracy first, then lib_uid for consistency
.group_by(["formula", "adduct"], maintain_order=True)
.first()
.drop("mz_error_abs") # Remove the temporary column
)
-
+
post_dedup_count = len(matches)
if logger and post_dedup_count < pre_dedup_count:
logger.debug(
@@ -512,10 +499,10 @@ def _update_identification_results(study, results, logger):
"score": match["score"],
"iso": 0, # Default to zero
})
-
+
# Convert to DataFrame and append to existing results
new_results_df = pl.DataFrame(records) if records else pl.DataFrame()
-
+
if not new_results_df.is_empty():
if hasattr(study, "id_df") and study.id_df is not None and not study.id_df.is_empty():
# Check if existing id_df has the iso column
@@ -524,11 +511,11 @@ def _update_identification_results(study, results, logger):
study.id_df = study.id_df.with_columns(pl.lit(0).alias("iso"))
if logger:
logger.debug("Added 'iso' column to existing id_df for schema compatibility")
-
+
study.id_df = pl.concat([study.id_df, new_results_df])
else:
study.id_df = new_results_df
-
+
if logger:
logger.debug(f"Added {len(records)} identification results to study.id_df")
elif not hasattr(study, "id_df"):
@@ -539,7 +526,7 @@ def _finalize_identification_results(study, params, logger):
"""Apply final scoring adjustments and update consensus columns."""
# Apply scoring adjustments based on compound and formula counts
_apply_scoring_adjustments(study, params)
-
+
# Update consensus_df with top-scoring identification results
_update_consensus_id_columns(study, logger)

@@ -568,7 +555,7 @@ def _validate_identify_inputs(study, logger=None):
if logger:
logger.error("Library (study.lib_df) is empty; call lib_load() first")
raise ValueError("Library (study.lib_df) is empty; call lib_load() first")
-
+
return True


@@ -612,8 +599,6 @@ def _prepare_consensus_features(study, features, logger=None):
return consensus_to_process, target_uids


-
-
def _get_adduct_probabilities(study):
"""Get adduct probabilities from _get_adducts() results."""
adducts_df = _get_adducts(study)
@@ -624,45 +609,42 @@ def _get_adduct_probabilities(study):
return adduct_prob_map


-
-
+ def _create_identification_results(
+ consensus_to_process, study, effective_mz_tol, effective_rt_tol, adduct_prob_map, logger=None
+ ):
"""Create identification results by matching consensus features against library (DEPRECATED - use optimized version)."""
# This function is now deprecated in favor of _perform_identification_matching
# Keep for backward compatibility but redirect to optimized version
results = _perform_identification_matching(
consensus_to_process, study, effective_mz_tol, effective_rt_tol, adduct_prob_map, logger
)
-
+
# Convert to legacy format for compatibility
legacy_results = []
features_with_matches = 0
total_matches = 0
-
+
for result in results:
if result["matches"]:
features_with_matches += 1
total_matches += len(result["matches"])
-
+
for match in result["matches"]:
legacy_results.append({
"consensus_uid": result["consensus_uid"],
"lib_uid": match["lib_uid"],
- "mz_delta": match["mz_delta"],
+ "mz_delta": match["mz_delta"],
"rt_delta": match["rt_delta"],
"matcher": match["matcher"],
"score": match["score"],
})
-
+
return legacy_results, features_with_matches, total_matches


def _apply_scoring_adjustments(study, params):
"""Apply scoring adjustments based on compound and formula counts using optimized operations."""
- if (
- not study.id_df.is_empty()
- and hasattr(study, "lib_df")
- and not study.lib_df.is_empty()
- ):
+ if not study.id_df.is_empty() and hasattr(study, "lib_df") and not study.lib_df.is_empty():
# Get penalty parameters
heteroatoms = getattr(params, "heteroatoms", ["Cl", "Br", "F", "I"])
heteroatom_penalty = getattr(params, "heteroatom_penalty", 0.7)
@@ -685,15 +667,14 @@ def _apply_scoring_adjustments(study, params):

# Join stats back and apply all penalties in one with_columns operation
heteroatom_conditions = [pl.col("formula").str.contains(atom) for atom in heteroatoms]
- has_heteroatoms =
- acc=pl.lit(False),
-
-
- )
+ has_heteroatoms = (
+ pl.fold(acc=pl.lit(False), function=lambda acc, x: acc | x, exprs=heteroatom_conditions)
+ if heteroatom_conditions
+ else pl.lit(False)
+ )

study.id_df = (
- id_with_lib
- .join(stats, on="consensus_uid", how="left")
+ id_with_lib.join(stats, on="consensus_uid", how="left")
.with_columns([
# Apply all penalties in sequence using case-when chains
pl.when(pl.col("formula").is_not_null() & has_heteroatoms)
@@ -716,7 +697,7 @@ def _apply_scoring_adjustments(study, params):
])
.select([
"consensus_uid",
- "lib_uid",
+ "lib_uid",
"mz_delta",
"rt_delta",
"matcher",
@@ -728,7 +709,7 @@ def _update_consensus_id_columns(study, logger=None):
def _update_consensus_id_columns(study, logger=None):
"""
Update consensus_df with top-scoring identification results using safe in-place updates.
-
+
FIXED VERSION: Prevents same compound from being assigned to vastly different m/z values.
"""
try:
@@ -736,15 +717,15 @@ def _update_consensus_id_columns(study, logger=None):
if logger:
logger.debug("No identification results to process")
return
-
+
if not hasattr(study, "lib_df") or study.lib_df is None or study.lib_df.is_empty():
if logger:
logger.debug("No library data available")
return
-
+
if not hasattr(study, "consensus_df") or study.consensus_df is None or study.consensus_df.is_empty():
if logger:
- logger.debug("No consensus data available")
+ logger.debug("No consensus data available")
return

# Get library columns we need (include mz for validation)
@@ -754,50 +735,45 @@ def _update_consensus_id_columns(study, logger=None):

# FIX 1: Join identification results with consensus m/z for validation
id_with_consensus = study.id_df.join(
- study.consensus_df.select(["consensus_uid", "mz"]),
- on="consensus_uid",
- how="left",
- suffix="_consensus"
+ study.consensus_df.select(["consensus_uid", "mz"]), on="consensus_uid", how="left", suffix="_consensus"
)

# FIX 2: Validate m/z accuracy - filter out poor matches
id_with_lib = id_with_consensus.join(
- study.lib_df.select(["lib_uid", "mz"]),
- on="lib_uid",
- how="left",
- suffix="_lib"
+ study.lib_df.select(["lib_uid", "mz"]), on="lib_uid", how="left", suffix="_lib"
)
-
+
# Calculate actual m/z error and filter out excessive errors
- id_validated = id_with_lib.with_columns([
-
- ])
-
+ id_validated = id_with_lib.with_columns([(pl.col("mz") - pl.col("mz_lib")).abs().alias("actual_mz_error")])
+
# Filter out matches with excessive m/z error
max_reasonable_error = 0.02 # 20 millidalton maximum error
id_validated = id_validated.filter(
(pl.col("actual_mz_error") <= max_reasonable_error) | pl.col("actual_mz_error").is_null()
)
-
+
if logger:
original_count = len(id_with_consensus)
validated_count = len(id_validated)
if validated_count < original_count:
- logger.warning(
+ logger.warning(
+ f"Filtered out {original_count - validated_count} identifications with excessive m/z error (>{max_reasonable_error:.3f} Da)"
+ )

# Get top-scoring identification for each consensus feature (from validated results)
top_ids = (
- id_validated
- .sort(["consensus_uid", "score"], descending=[False, True])
+ id_validated.sort(["consensus_uid", "score"], descending=[False, True])
.group_by("consensus_uid", maintain_order=True)
.first()
.join(study.lib_df.select(lib_columns), on="lib_uid", how="left")
.select([
"consensus_uid",
"name",
- pl.col("class").alias("id_top_class")
+ pl.col("class").alias("id_top_class")
+ if "class" in lib_columns
+ else pl.lit(None, dtype=pl.String).alias("id_top_class"),
pl.col("adduct").alias("id_top_adduct"),
- pl.col("score").alias("id_top_score")
+ pl.col("score").alias("id_top_score"),
])
.rename({"name": "id_top_name"})
)
@@ -805,28 +781,23 @@ def _update_consensus_id_columns(study, logger=None):
# FIX 3: Check for conflicts where same compound+adduct assigned to very different m/z
if not top_ids.is_empty():
compound_groups = (
- top_ids
- .join(study.consensus_df.select(["consensus_uid", "mz"]), on="consensus_uid", how="left")
+ top_ids.join(study.consensus_df.select(["consensus_uid", "mz"]), on="consensus_uid", how="left")
.group_by(["id_top_name", "id_top_adduct"])
.agg([
pl.col("consensus_uid").count().alias("count"),
pl.col("mz").min().alias("mz_min"),
- pl.col("mz").max().alias("mz_max")
- ])
- .with_columns([
- (pl.col("mz_max") - pl.col("mz_min")).alias("mz_range")
+ pl.col("mz").max().alias("mz_max"),
])
+ .with_columns([(pl.col("mz_max") - pl.col("mz_min")).alias("mz_range")])
)
-
+
# Find problematic assignments (same compound+adduct with >0.1 Da m/z range)
- problematic = compound_groups.filter(
-
- )
-
+ problematic = compound_groups.filter((pl.col("count") > 1) & (pl.col("mz_range") > 0.1))
+
if not problematic.is_empty() and logger:
for row in problematic.iter_rows(named=True):
name = row["id_top_name"]
- adduct = row["id_top_adduct"]
+ adduct = row["id_top_adduct"]
count = row["count"]
mz_range = row["mz_range"]
logger.warning(
@@ -836,15 +807,13 @@ def _update_consensus_id_columns(study, logger=None):
# Ensure we have the id_top columns in consensus_df
for col_name, dtype in [
("id_top_name", pl.String),
- ("id_top_class", pl.String),
+ ("id_top_class", pl.String),
("id_top_adduct", pl.String),
("id_top_score", pl.Float64),
- ("id_source", pl.String)
+ ("id_source", pl.String),
]:
if col_name not in study.consensus_df.columns:
- study.consensus_df = study.consensus_df.with_columns(
- pl.lit(None, dtype=dtype).alias(col_name)
- )
+ study.consensus_df = study.consensus_df.with_columns(pl.lit(None, dtype=dtype).alias(col_name))

# Create a mapping dictionary for efficient updates
id_mapping = {}
@@ -854,42 +823,36 @@ def _update_consensus_id_columns(study, logger=None):
"id_top_name": row["id_top_name"],
"id_top_class": row["id_top_class"],
"id_top_adduct": row["id_top_adduct"],
- "id_top_score": row["id_top_score"]
+ "id_top_score": row["id_top_score"],
}

# Update consensus_df using map_elements (safer than join for avoiding duplicates)
if id_mapping:
study.consensus_df = study.consensus_df.with_columns([
- pl.col("consensus_uid")
-
-
-
-
-
-
- ).
-
-
-
-
- pl.col("consensus_uid").map_elements(
- lambda uid: id_mapping.get(uid, {}).get("id_top_score"),
- return_dtype=pl.Float64
- ).alias("id_top_score")
+ pl.col("consensus_uid")
+ .map_elements(lambda uid: id_mapping.get(uid, {}).get("id_top_name"), return_dtype=pl.String)
+ .alias("id_top_name"),
+ pl.col("consensus_uid")
+ .map_elements(lambda uid: id_mapping.get(uid, {}).get("id_top_class"), return_dtype=pl.String)
+ .alias("id_top_class"),
+ pl.col("consensus_uid")
+ .map_elements(lambda uid: id_mapping.get(uid, {}).get("id_top_adduct"), return_dtype=pl.String)
+ .alias("id_top_adduct"),
+ pl.col("consensus_uid")
+ .map_elements(lambda uid: id_mapping.get(uid, {}).get("id_top_score"), return_dtype=pl.Float64)
+ .alias("id_top_score"),
])

if logger:
num_updated = len(id_mapping)
logger.debug(f"Updated consensus_df with top identifications for {num_updated} features")
-
+
except Exception as e:
if logger:
logger.error(f"Error updating consensus_df with identification results: {e}")
# Don't re-raise to avoid breaking the identification process


-
-
def identify(study, features=None, params=None, **kwargs):
"""Identify consensus features against the loaded library.

@@ -915,12 +878,12 @@ def identify(study, features=None, params=None, **kwargs):
"""
# Get logger from study if available
logger = getattr(study, "logger", None)
-
+
# Setup parameters early
params = _setup_identify_parameters(params, kwargs)
effective_mz_tol = getattr(params, "mz_tol", 0.01)
effective_rt_tol = getattr(params, "rt_tol", 2.0)
-
+
if logger:
logger.debug(
f"Starting identification with mz_tolerance={effective_mz_tol}, rt_tolerance={effective_rt_tol}",
@@ -937,7 +900,7 @@ def identify(study, features=None, params=None, **kwargs):

# Smart reset of id_df: only clear results for features being re-identified
_smart_reset_id_results(study, target_uids, logger)
-
+
# Cache adduct probabilities (expensive operation)
adduct_prob_map = _get_cached_adduct_probabilities(study, logger)

@@ -1037,9 +1000,7 @@ def get_id(study, features=None) -> pl.DataFrame:
# Join with consensus_df to get consensus feature m/z and RT
consensus_cols = ["consensus_uid", "mz", "rt"]
# Only select columns that exist in consensus_df
- available_consensus_cols = [
- col for col in consensus_cols if col in study.consensus_df.columns
- ]
+ available_consensus_cols = [col for col in consensus_cols if col in study.consensus_df.columns]

result_df = result_df.join(
study.consensus_df.select(available_consensus_cols),
@@ -1101,9 +1062,7 @@ def get_id(study, features=None) -> pl.DataFrame:
column_order.extend(remaining_cols)

# Filter out None values and select existing columns
- final_column_order = [
- col for col in column_order if col is not None and col in result_df.columns
- ]
+ final_column_order = [col for col in column_order if col is not None and col in result_df.columns]

result_df = result_df.select(final_column_order)

@@ -1115,10 +1074,7 @@ def get_id(study, features=None) -> pl.DataFrame:
pl.col("cmpd_uid").n_unique().alias("num_cmpds")
if "cmpd_uid" in result_df.columns
else pl.lit(None).alias("num_cmpds"),
- pl.col("formula")
- .filter(pl.col("formula").is_not_null())
- .n_unique()
- .alias("num_formulas")
+ pl.col("formula").filter(pl.col("formula").is_not_null()).n_unique().alias("num_formulas")
if "formula" in result_df.columns
else pl.lit(None).alias("num_formulas"),
],
@@ -1177,9 +1133,7 @@ def get_id(study, features=None) -> pl.DataFrame:

# Get the highest scoring entry's RT as reference
reference_rt = (
- group_df["rt"][0]
- if "rt" in group_df.columns and group_df["rt"][0] is not None
- else None
+ group_df["rt"][0] if "rt" in group_df.columns and group_df["rt"][0] is not None else None
)

# Filter entries: keep those with same RT as highest scoring entry
@@ -1193,11 +1147,7 @@ def get_id(study, features=None) -> pl.DataFrame:
rt_filtered = group_df

# Check multiply charged constraint
- if (
- "z" in rt_filtered.columns
- and "adduct" in rt_filtered.columns
- and len(rt_filtered) > 0
- ):
+ if "z" in rt_filtered.columns and "adduct" in rt_filtered.columns and len(rt_filtered) > 0:
# Check if there are multiply charged adducts
multiply_charged = rt_filtered.filter(
(pl.col("z") > 1) | (pl.col("z") < -1),
@@ -1259,7 +1209,7 @@ def id_reset(study):
if hasattr(study, "consensus_df") and not study.consensus_df.is_empty():
if logger:
logger.debug("Resetting id_top_* columns in consensus_df")
-
+
# Check which columns exist before trying to update them
id_columns_to_reset = []
for col in ["id_top_name", "id_top_class", "id_top_adduct", "id_top_score", "id_source"]:
@@ -1268,7 +1218,7 @@ def id_reset(study):
id_columns_to_reset.append(pl.lit(None, dtype=pl.Float64).alias(col))
else:
id_columns_to_reset.append(pl.lit(None, dtype=pl.String).alias(col))
-
+
if id_columns_to_reset:
study.consensus_df = study.consensus_df.with_columns(id_columns_to_reset)

@@ -1306,24 +1256,24 @@ def lib_reset(study):
if hasattr(study, "consensus_df") and not study.consensus_df.is_empty():
if logger:
logger.debug("Checking for consensus features created by lib_to_consensus()")
-
+
try:
# Filter for features created by lib_to_consensus()
# These can be identified by:
# 1. number_samples < 1 (set to 0.0 by lib_to_consensus)
# 2. AND have corresponding entries in consensus_mapping_df with sample_uid = 0 (virtual sample)
-
+
# First check if we have any features with number_samples < 1
potential_lib_features = study.consensus_df.filter(pl.col("number_samples") < 1)
-
+
if potential_lib_features is not None and not potential_lib_features.is_empty():
# Further filter by checking if they have sample_uid = 0 in consensus_mapping_df
# This ensures we only remove library-derived features, not legitimate features with 0 samples
if hasattr(study, "consensus_mapping_df") and not study.consensus_mapping_df.is_empty():
- lib_consensus_uids =
- pl.col("sample_uid") == 0
- )
-
+ lib_consensus_uids = (
+ study.consensus_mapping_df.filter(pl.col("sample_uid") == 0)["consensus_uid"].unique().to_list()
+ )
+
if lib_consensus_uids:
lib_consensus_features = potential_lib_features.filter(
pl.col("consensus_uid").is_in(lib_consensus_uids)
@@ -1335,15 +1285,15 @@ def lib_reset(study):
lib_consensus_features = potential_lib_features
else:
lib_consensus_features = pl.DataFrame() # No features with number_samples < 1
-
+
if lib_consensus_features is not None and not lib_consensus_features.is_empty():
num_lib_features = len(lib_consensus_features)
if logger:
logger.info(f"Removing {num_lib_features} consensus features created by lib_to_consensus()")
-
+
# Use consensus_delete to remove these features and all dependent data
study.consensus_delete(lib_consensus_features)
-
+
if logger:
logger.debug("Successfully removed library-derived consensus features")
else:
@@ -1375,7 +1325,7 @@ def lib_reset(study):
if hasattr(study, "consensus_df") and not study.consensus_df.is_empty():
if logger:
logger.debug("Resetting id_top_* columns in consensus_df")
-
+
# Check which columns exist before trying to update them
id_columns_to_reset = []
for col in ["id_top_name", "id_top_class", "id_top_adduct", "id_top_score", "id_source"]:
@@ -1384,7 +1334,7 @@ def lib_reset(study):
id_columns_to_reset.append(pl.lit(None, dtype=pl.Float64).alias(col))
else:
id_columns_to_reset.append(pl.lit(None, dtype=pl.String).alias(col))
-
+
if id_columns_to_reset:
study.consensus_df = study.consensus_df.with_columns(id_columns_to_reset)

@@ -1399,7 +1349,7 @@ def lib_reset(study):
if logger:
logger.debug("Removing 'lib_load' from history")
del study.history["lib_load"]
-
+
if "lib_to_consensus" in study.history:
if logger:
logger.debug("Removing 'lib_to_consensus' from history")
@@ -1445,9 +1395,7 @@ def _get_adducts(study, adducts_list: list | None = None, **kwargs):
adducts_list_to_use = adducts_list
if adducts_list_to_use is None:
adducts_list_to_use = (
- study.parameters.adducts
- if hasattr(study.parameters, "adducts") and study.parameters.adducts
- else []
+ study.parameters.adducts if hasattr(study.parameters, "adducts") and study.parameters.adducts else []
)

# Get parameters with study-specific defaults
@@ -1561,11 +1509,9 @@ def _get_adducts(study, adducts_list: list | None = None, **kwargs):
{
"components": components,
"formatted_name": formatted_name,
- "total_mass_shift": float(pos_spec["mass_shift"])
- + float(neut_spec["mass_shift"]),
+ "total_mass_shift": float(pos_spec["mass_shift"]) + float(neut_spec["mass_shift"]),
"total_charge": total_charge,
- "combined_probability": float(pos_spec["probability"])
- * float(neut_spec["probability"]),
+ "combined_probability": float(pos_spec["probability"]) * float(neut_spec["probability"]),
"complexity": 2,
},
)
@@ -1739,9 +1685,7 @@ def _format_adduct_name(components: list[dict]) -> str:
elif abs(total_charge) == 1:
charge_str = "1+" if total_charge > 0 else "1-"
else:
- charge_str = (
- f"{abs(total_charge)}+" if total_charge > 0 else f"{abs(total_charge)}-"
- )
+ charge_str = f"{abs(total_charge)}+" if total_charge > 0 else f"{abs(total_charge)}-"

return f"[M{formula}]{charge_str}"

@@ -1749,53 +1693,53 @@ def _format_adduct_name(components: list[dict]) -> str:
def _generate_13c_isotopes(lib_df):
"""
Generate 13C isotope variants for library entries.
-
+
For each compound with n carbon atoms, creates n+1 entries:
- iso=0: original compound (no 13C)
- iso=1: one 13C isotope (+1.00335 Da)
- iso=2: two 13C isotopes (+2.00670 Da)
- ...
- iso=n: n 13C isotopes (+n*1.00335 Da)
-
+
All isotopomers share the same quant_group.
-
+
Args:
lib_df: Polars DataFrame with library entries
-
+
Returns:
Polars DataFrame with additional 13C isotope entries
"""
if lib_df.is_empty():
return lib_df
-
+
# First, ensure all original entries have iso=0
original_df = lib_df.with_columns(pl.lit(0).alias("iso"))
-
+
isotope_entries = []
next_lib_uid = lib_df["lib_uid"].max() + 1 if len(lib_df) > 0 else 1
-
+
# Mass difference for one 13C isotope
c13_mass_shift = 1.00335 # Mass difference between 13C and 12C
-
+
for row in original_df.iter_rows(named=True):
formula = row.get("formula", "")
if not formula:
continue
-
+
# Count carbon atoms in the formula
carbon_count = _count_carbon_atoms(formula)
if carbon_count == 0:
continue
-
+
# Get the original quant_group to keep it consistent across isotopes
# All isotopomers of the same compound should have the same quant_group
quant_group = row.get("quant_group", row.get("cmpd_uid", row.get("lib_uid", 1)))
-
+
# Generate isotope variants (1 to n 13C atoms)
for iso_num in range(1, carbon_count + 1):
# Calculate mass shift for this number of 13C isotopes
mass_shift = iso_num * c13_mass_shift
-
+
# Create new entry
isotope_entry = dict(row) # Copy all fields
isotope_entry["lib_uid"] = next_lib_uid
@@ -1803,10 +1747,10 @@ def _generate_13c_isotopes(lib_df):
isotope_entry["m"] = row["m"] + mass_shift
isotope_entry["mz"] = (row["m"] + mass_shift) / abs(row["z"]) if row["z"] != 0 else row["m"] + mass_shift
isotope_entry["quant_group"] = quant_group # Keep same quant_group
-
+
isotope_entries.append(isotope_entry)
next_lib_uid += 1
-
+
# Combine original entries (now with iso=0) with isotope entries
if isotope_entries:
isotope_df = pl.DataFrame(isotope_entries)
@@ -1818,7 +1762,7 @@ def _generate_13c_isotopes(lib_df):
# Get common schema
original_schema = original_df.schema
isotope_schema = isotope_df.schema
-
+
# Cast isotope_df columns to match original_df schema where possible
cast_exprs = []
for col_name in isotope_df.columns:
@@ -1827,7 +1771,7 @@ def _generate_13c_isotopes(lib_df):
cast_exprs.append(pl.col(col_name).cast(target_dtype, strict=False))
else:
cast_exprs.append(pl.col(col_name))
-
+
isotope_df_cast = isotope_df.select(cast_exprs)
return pl.concat([original_df, isotope_df_cast])
else:
@@ -1837,75 +1781,75 @@ def _generate_13c_isotopes(lib_df):
def _count_carbon_atoms(formula: str) -> int:
"""
Count the number of carbon atoms in a molecular formula.
-
+
Args:
formula: Molecular formula string like "C6H12O6"
-
+
Returns:
Number of carbon atoms
"""
import re
-
+
if not formula or not isinstance(formula, str):
return 0
-
+
# Look for carbon followed by optional number
# C followed by digits, or just C (which means 1)
- carbon_matches = re.findall(r
-
+ carbon_matches = re.findall(r"C(\d*)", formula)
+
total_carbons = 0
for match in carbon_matches:
- if match ==
+ if match == "":
# Just 'C' without number means 1 carbon
total_carbons += 1
else:
# 'C' followed by number
total_carbons += int(match)
-
+
return total_carbons


def lib_to_consensus(study, chrom_fhwm: float = 5.0, mz_tol: float = 0.01, rt_tol: float = 2.0):
"""Create consensus features from library entries instead of features_df.
-
+
This method takes all rows from lib_df and creates corresponding entries in
- consensus_df with the same columns as merge(). Instead of relying on
+ consensus_df with the same columns as merge(). Instead of relying on
features_df, it populates consensus features directly from library data.
-
+
Before creating new features, it checks for pre-existing consensus features:
- If rt in lib_df is null: picks consensus feature with matching mz and largest inty_mean
- If rt is not null: picks consensus feature with matching mz and rt within tolerance
- If a match is found, skips to the next library entry
-
+
Args:
study: Study instance with lib_df populated
- chrom_fhwm: Chromatographic full width at half maximum in seconds
+ chrom_fhwm: Chromatographic full width at half maximum in seconds
to infer rt_start_mean and rt_end_mean (default: 5.0)
mz_tol: m/z tolerance for matching existing consensus features (default: 0.01)
rt_tol: RT tolerance for matching existing consensus features (default: 2.0)
-
+
Side effects:
Adds rows to study.consensus_df and study.consensus_mapping_df
Calls study.find_ms2() at the end
"""
# Get logger from study if available
logger = getattr(study, "logger", None)
-
+
# Validate inputs
if getattr(study, "lib_df", None) is None or study.lib_df.is_empty():
if logger:
logger.error("Library (study.lib_df) is empty; call lib_load() first")
raise ValueError("Library (study.lib_df) is empty; call lib_load() first")
-
+
if logger:
logger.info(f"Creating consensus features from {len(study.lib_df)} library entries")
-
+
# Initialize consensus DataFrames if they don't exist
if not hasattr(study, "consensus_df") or study.consensus_df is None:
study.consensus_df = pl.DataFrame()
if not hasattr(study, "consensus_mapping_df") or study.consensus_mapping_df is None:
study.consensus_mapping_df = pl.DataFrame()
-
+
# Get cached adducts for consistent adduct handling
cached_adducts_df = None
cached_valid_adducts = None
@@ -1919,26 +1863,26 @@ def lib_to_consensus(study, chrom_fhwm: float = 5.0, mz_tol: float = 0.01, rt_to
if logger:
logger.warning(f"Could not retrieve study adducts: {e}")
cached_valid_adducts = set()
-
+
# Always allow '?' adducts
cached_valid_adducts.add("?")
-
+
# Get starting consensus_uid counter
if not study.consensus_df.is_empty():
max_existing_uid = study.consensus_df["consensus_uid"].max()
consensus_uid_counter = int(max_existing_uid) + 1 if max_existing_uid is not None else 0
else:
consensus_uid_counter = 0
-
+
# Track [M+H] iso=0 and [M-H] iso=0 entries for adduct grouping
base_adduct_groups = {} # key: (mz, adduct_base), value: adduct_group
-
+
# Process each library entry
consensus_metadata = []
consensus_mapping_list = []
matched_count = 0
skipped_count = 0
-
+
for lib_row in study.lib_df.iter_rows(named=True):
# Extract basic library data
lib_uid = lib_row.get("lib_uid")
@@ -1947,21 +1891,19 @@ def lib_to_consensus(study, chrom_fhwm: float = 5.0, mz_tol: float = 0.01, rt_to
iso = lib_row.get("iso", 0)
adduct = lib_row.get("adduct")
z = lib_row.get("z", 1) # charge
-
+
# Skip entries without essential data
if mz is None:
if logger:
logger.warning(f"Skipping library entry {lib_uid} - no m/z value")
continue
-
+
# Check for pre-existing consensus features
existing_match = None
if not study.consensus_df.is_empty():
# Filter by m/z tolerance first
- mz_matches = study.consensus_df.filter(
-
- )
-
+ mz_matches = study.consensus_df.filter((pl.col("mz") >= mz - mz_tol) & (pl.col("mz") <= mz + mz_tol))
+
if not mz_matches.is_empty():
if rt is None:
# If rt is null, pick the consensus feature with largest inty_mean
@@ -1974,7 +1916,7 @@ def lib_to_consensus(study, chrom_fhwm: float = 5.0, mz_tol: float = 0.01, rt_to
)
if not rt_matches.is_empty():
existing_match = rt_matches.sort("inty_mean", descending=True).head(1)
-
+
if existing_match is not None and len(existing_match) > 0:
# Found a matching consensus feature, skip this library entry
matched_count += 1
@@ -1982,27 +1924,29 @@ def lib_to_consensus(study, chrom_fhwm: float = 5.0, mz_tol: float = 0.01, rt_to
match_uid = existing_match["consensus_uid"][0]
match_mz = existing_match["mz"][0]
match_rt = existing_match["rt"][0]
- logger.debug(
+ logger.debug(
+ f"Library entry {lib_uid} (mz={mz:.4f}, rt={rt}) matched existing consensus {match_uid} (mz={match_mz:.4f}, rt={match_rt})"
+ )
continue
-
+
# No match found, create new consensus feature
# Handle missing RT - use 0 as placeholder
if rt is None:
rt = 0.0
if logger and skipped_count < 5: # Log first few
logger.debug(f"Library entry {lib_uid} has no RT, using 0.0")
-
+
# Calculate RT range based on chrom_fhwm
half_width = chrom_fhwm / 2.0
rt_start = rt - half_width
rt_end = rt + half_width
-
+
# Get adduct information
adduct_top = adduct if adduct else "?"
adduct_charge_top = None
adduct_mass_shift_top = None
adduct_mass_neutral_top = None
-
+
# Parse adduct to get charge and mass shift
if adduct_top and cached_adducts_df is not None and not cached_adducts_df.is_empty():
# Look for exact match in study adducts
@@ -2011,7 +1955,7 @@ def lib_to_consensus(study, chrom_fhwm: float = 5.0, mz_tol: float = 0.01, rt_to
adduct_row = matching_adduct.row(0, named=True)
adduct_charge_top = adduct_row["charge"]
adduct_mass_shift_top = adduct_row["mass_shift"]
-
+
# Fallback to default values if not found
if adduct_charge_top is None:
adduct_charge_top = int(z) if z else 1
@@ -2029,15 +1973,15 @@ def lib_to_consensus(study, chrom_fhwm: float = 5.0, mz_tol: float = 0.01, rt_to
|
|
|
2029
1973
|
adduct_mass_shift_top = 1.007825
|
|
2030
1974
|
if adduct_top == "?":
|
|
2031
1975
|
adduct_top = "[M+?]1+"
|
|
2032
|
-
|
|
1976
|
+
|
|
2033
1977
|
# Calculate neutral mass
|
|
2034
1978
|
if adduct_charge_top and adduct_mass_shift_top is not None:
|
|
2035
1979
|
adduct_mass_neutral_top = mz * abs(adduct_charge_top) - adduct_mass_shift_top
|
|
2036
|
-
|
|
1980
|
+
|
|
2037
1981
|
# Determine adduct group for isotopologues and related adducts
|
|
2038
1982
|
adduct_group = consensus_uid_counter # Default: each entry gets its own group
|
|
2039
1983
|
adduct_of = 0 # Default: this is the base adduct
|
|
2040
|
-
|
|
1984
|
+
|
|
2041
1985
|
# Track base adducts ([M+H] iso=0 or [M-H] iso=0) for grouping
|
|
2042
1986
|
base_adduct_key = None
|
|
2043
1987
|
if iso == 0 and adduct_top in ["[M+H]+", "[M+H]1+", "[M-H]-", "[M-H]1-"]:
|
|
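The neutral-mass line above is plain arithmetic: multiply the measured m/z by the absolute charge and subtract the adduct mass shift. A worked example with illustrative numbers (these values are not taken from the package):

    # Illustrative values only
    mz = 181.0708                      # measured m/z of a [M+H]+ ion
    adduct_charge_top = 1
    adduct_mass_shift_top = 1.007825   # proton mass shift, the fallback used in the diff

    adduct_mass_neutral_top = mz * abs(adduct_charge_top) - adduct_mass_shift_top
    print(round(adduct_mass_neutral_top, 6))  # 180.062975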
@@ -2049,21 +1993,22 @@ def lib_to_consensus(study, chrom_fhwm: float = 5.0, mz_tol: float = 0.01, rt_to
 # Calculate the base m/z (subtract isotope mass shifts)
 c13_mass_shift = 1.00335
 base_mz = mz - (iso * c13_mass_shift / abs(adduct_charge_top))
-
+
 # Look for matching base adduct
 for (stored_mz, stored_adduct), stored_group in base_adduct_groups.items():
 if abs(stored_mz - base_mz) < mz_tol and stored_adduct == adduct_top:
 adduct_group = stored_group
 adduct_of = stored_group
 break
-
+
 # Create adduct values list with proper structure (format: structured data with fields: adduct, count, percentage, mass)
 adduct_values = [{"adduct": adduct_top, "count": 1, "percentage": 100.0, "mass": 0.0}]
-
+
 # Generate unique consensus_id string
 import uuid
-
-
+
+consensus_id_str = str(uuid.uuid4()).replace("-", "")[:16]
+
 # Build consensus metadata with requested modifications for new entries
 metadata = {
 "consensus_uid": consensus_uid_counter,
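The isotope handling above maps each 13C isotopologue back to the m/z of its monoisotopic form (iso = 0) so isotopologues of the same adduct can share one adduct group. A small sketch of that computation, using made-up library rows:

    # Illustrative isotopologue series of one singly charged [M+H]+ adduct
    c13_mass_shift = 1.00335
    entries = [
        {"mz": 181.0708, "iso": 0, "z": 1},
        {"mz": 182.0742, "iso": 1, "z": 1},
        {"mz": 183.0775, "iso": 2, "z": 1},
    ]

    for e in entries:
        base_mz = e["mz"] - (e["iso"] * c13_mass_shift / abs(e["z"]))
        print(round(base_mz, 4))  # all three collapse to ~181.071, so they fall in one group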
@@ -2096,7 +2041,9 @@ def lib_to_consensus(study, chrom_fhwm: float = 5.0, mz_tol: float = 0.01, rt_to
 "adducts": adduct_values,
 "adduct_charge_top": adduct_charge_top,
 "adduct_group": adduct_group, # Use calculated adduct group
-"adduct_mass_neutral_top": round(adduct_mass_neutral_top, 6)
+"adduct_mass_neutral_top": round(adduct_mass_neutral_top, 6)
+if adduct_mass_neutral_top is not None
+else None,
 "adduct_mass_shift_top": round(adduct_mass_shift_top, 6) if adduct_mass_shift_top is not None else None,
 "adduct_of": adduct_of, # Use calculated adduct_of
 "adduct_top": adduct_top,
@@ -2105,9 +2052,9 @@ def lib_to_consensus(study, chrom_fhwm: float = 5.0, mz_tol: float = 0.01, rt_to
 "id_top_adduct": None, # Set to null as requested
 "id_top_score": None, # Set to null as requested
 }
-
+
 consensus_metadata.append(metadata)
-
+
 # Create mapping entry (maps to library entry as "virtual" feature)
 # Use lib_uid as the feature_uid and a virtual sample_uid of 0
 # Match existing consensus_mapping_df column order: consensus_uid, feature_uid, sample_uid
@@ -2116,18 +2063,20 @@ def lib_to_consensus(study, chrom_fhwm: float = 5.0, mz_tol: float = 0.01, rt_to
 "feature_uid": lib_uid, # Use lib_uid as feature reference
 "sample_uid": 0, # Virtual sample for library entries
 })
-
+
 consensus_uid_counter += 1
-
+
 # Log matching statistics
 if logger:
 total_processed = matched_count + len(consensus_metadata)
-logger.info(
-
+logger.info(
+f"Processed {total_processed} library entries: {matched_count} matched existing consensus features, {len(consensus_metadata)} created new features"
+)
+
 # Convert to DataFrames with proper schema alignment
 if consensus_metadata:
 new_consensus_df = pl.DataFrame(consensus_metadata, strict=False)
-
+
 # Ensure schema compatibility with existing consensus_df
 if not study.consensus_df.is_empty():
 # Cast columns to match existing schema
@@ -2143,36 +2092,36 @@ def lib_to_consensus(study, chrom_fhwm: float = 5.0, mz_tol: float = 0.01, rt_to
 cast_exprs.append(pl.col(col_name).cast(target_dtype, strict=False))
 else:
 cast_exprs.append(pl.col(col_name))
-
+
 new_consensus_df = new_consensus_df.select(cast_exprs)
-
+
 new_consensus_mapping_df = pl.DataFrame(consensus_mapping_list, strict=False)
-
+
 # Append to existing DataFrames
 if not study.consensus_df.is_empty():
 study.consensus_df = pl.concat([study.consensus_df, new_consensus_df])
 else:
 study.consensus_df = new_consensus_df
-
+
 if not study.consensus_mapping_df.is_empty():
 study.consensus_mapping_df = pl.concat([study.consensus_mapping_df, new_consensus_mapping_df])
 else:
 study.consensus_mapping_df = new_consensus_mapping_df
-
+
 if logger:
 logger.info(f"Added {len(consensus_metadata)} consensus features from library")
 else:
 if logger:
 logger.warning("No valid consensus features created from library")
 return
-
+
 # Store operation in history
 if hasattr(study, "update_history"):
 study.update_history(
 ["lib_to_consensus"],
 {"chrom_fhwm": chrom_fhwm, "lib_entries": len(study.lib_df)},
 )
-
+
 # Perform find_ms2 at the end
 try:
 if hasattr(study, "find_ms2"):
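The cast-before-concat step above exists because pl.concat expects matching schemas, and rows built from Python dicts can infer different dtypes than the existing consensus table. A minimal sketch of that pattern under assumed column names (not the package's real schema):

    import polars as pl

    existing = pl.DataFrame({"consensus_uid": [1], "rt": [120.0]})   # rt: Float64
    new_rows = pl.DataFrame({"consensus_uid": [2], "rt": [0]})       # rt inferred as Int64

    # Cast the new columns to the existing schema before concatenating
    cast_exprs = [pl.col(name).cast(dtype, strict=False) for name, dtype in existing.schema.items()]
    new_rows = new_rows.select(cast_exprs)

    combined = pl.concat([existing, new_rows])
    print(combined)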
@@ -2185,6 +2134,6 @@ def lib_to_consensus(study, chrom_fhwm: float = 5.0, mz_tol: float = 0.01, rt_to
 except Exception as e:
 if logger:
 logger.warning(f"find_ms2 failed: {e}")
-
+
 if logger:
 logger.success(f"lib_to_consensus completed: {len(consensus_metadata)} features added")